{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012738853503184714, "grad_norm": 27.578015588062595, "learning_rate": 0.0, "loss": 2.2484302520751953, "step": 1 }, { "epoch": 0.025477707006369428, "grad_norm": 27.351013009442298, "learning_rate": 4.1666666666666667e-07, "loss": 2.215416193008423, "step": 2 }, { "epoch": 0.03821656050955414, "grad_norm": 29.00648618644296, "learning_rate": 8.333333333333333e-07, "loss": 2.1969661712646484, "step": 3 }, { "epoch": 0.050955414012738856, "grad_norm": 24.279613305984917, "learning_rate": 1.25e-06, "loss": 2.1304638385772705, "step": 4 }, { "epoch": 0.06369426751592357, "grad_norm": 24.548939602077972, "learning_rate": 1.6666666666666667e-06, "loss": 2.321625232696533, "step": 5 }, { "epoch": 0.07643312101910828, "grad_norm": 25.979670653733457, "learning_rate": 2.0833333333333334e-06, "loss": 2.1303162574768066, "step": 6 }, { "epoch": 0.08917197452229299, "grad_norm": 18.68650885616665, "learning_rate": 2.5e-06, "loss": 1.8076802492141724, "step": 7 }, { "epoch": 0.10191082802547771, "grad_norm": 16.7812576898942, "learning_rate": 2.916666666666667e-06, "loss": 1.7900886535644531, "step": 8 }, { "epoch": 0.11464968152866242, "grad_norm": 14.18738912625846, "learning_rate": 3.3333333333333333e-06, "loss": 1.7915903329849243, "step": 9 }, { "epoch": 0.12738853503184713, "grad_norm": 14.116799195872652, "learning_rate": 3.7500000000000005e-06, "loss": 1.8171511888504028, "step": 10 }, { "epoch": 0.14012738853503184, "grad_norm": 10.442018372124066, "learning_rate": 4.166666666666667e-06, "loss": 1.7455570697784424, "step": 11 }, { "epoch": 0.15286624203821655, "grad_norm": 9.192045468171578, "learning_rate": 4.583333333333333e-06, "loss": 1.6337864398956299, "step": 12 }, { "epoch": 0.16560509554140126, "grad_norm": 8.260571303853565, "learning_rate": 5e-06, "loss": 1.778015375137329, "step": 13 }, { "epoch": 0.17834394904458598, "grad_norm": 6.460613846297531, "learning_rate": 5.416666666666667e-06, "loss": 1.712306022644043, "step": 14 }, { "epoch": 0.1910828025477707, "grad_norm": 6.843351540555302, "learning_rate": 5.833333333333334e-06, "loss": 1.5768513679504395, "step": 15 }, { "epoch": 0.20382165605095542, "grad_norm": 6.133058262409406, "learning_rate": 6.25e-06, "loss": 1.5611257553100586, "step": 16 }, { "epoch": 0.21656050955414013, "grad_norm": 5.180005658869054, "learning_rate": 6.666666666666667e-06, "loss": 1.5967652797698975, "step": 17 }, { "epoch": 0.22929936305732485, "grad_norm": 5.305167134267678, "learning_rate": 7.083333333333335e-06, "loss": 1.364829659461975, "step": 18 }, { "epoch": 0.24203821656050956, "grad_norm": 5.355870721587038, "learning_rate": 7.500000000000001e-06, "loss": 1.6430319547653198, "step": 19 }, { "epoch": 0.25477707006369427, "grad_norm": 5.0292720888884075, "learning_rate": 7.916666666666667e-06, "loss": 1.5467270612716675, "step": 20 }, { "epoch": 0.267515923566879, "grad_norm": 4.948954166107489, "learning_rate": 8.333333333333334e-06, "loss": 1.5446631908416748, "step": 21 }, { "epoch": 0.2802547770700637, "grad_norm": 4.859436510097199, "learning_rate": 8.750000000000001e-06, "loss": 1.4481780529022217, "step": 22 }, { "epoch": 0.2929936305732484, "grad_norm": 5.3831221005725896, "learning_rate": 9.166666666666666e-06, "loss": 1.4933228492736816, "step": 23 }, { "epoch": 0.3057324840764331, "grad_norm": 4.473608276014855, "learning_rate": 9.583333333333335e-06, "loss": 1.5807710886001587, "step": 24 }, { "epoch": 0.3184713375796178, "grad_norm": 4.109425789809634, "learning_rate": 1e-05, "loss": 1.349104404449463, "step": 25 }, { "epoch": 0.33121019108280253, "grad_norm": 4.634192438556494, "learning_rate": 9.999456158087994e-06, "loss": 1.4354019165039062, "step": 26 }, { "epoch": 0.34394904458598724, "grad_norm": 5.0726515873395, "learning_rate": 9.997824750657586e-06, "loss": 1.566201090812683, "step": 27 }, { "epoch": 0.35668789808917195, "grad_norm": 4.8805336123469205, "learning_rate": 9.995106132599869e-06, "loss": 1.411285161972046, "step": 28 }, { "epoch": 0.36942675159235666, "grad_norm": 4.859867988307453, "learning_rate": 9.99130089531422e-06, "loss": 1.2867789268493652, "step": 29 }, { "epoch": 0.3821656050955414, "grad_norm": 4.432994127396081, "learning_rate": 9.98640986657965e-06, "loss": 1.5999436378479004, "step": 30 }, { "epoch": 0.39490445859872614, "grad_norm": 4.40223600447386, "learning_rate": 9.980434110374725e-06, "loss": 1.4318150281906128, "step": 31 }, { "epoch": 0.40764331210191085, "grad_norm": 4.902591623548149, "learning_rate": 9.973374926646117e-06, "loss": 1.607371211051941, "step": 32 }, { "epoch": 0.42038216560509556, "grad_norm": 4.909609907293681, "learning_rate": 9.965233851025816e-06, "loss": 1.443784236907959, "step": 33 }, { "epoch": 0.43312101910828027, "grad_norm": 4.456375484305202, "learning_rate": 9.956012654497073e-06, "loss": 1.570559024810791, "step": 34 }, { "epoch": 0.445859872611465, "grad_norm": 4.599861555148005, "learning_rate": 9.945713343009154e-06, "loss": 1.548865556716919, "step": 35 }, { "epoch": 0.4585987261146497, "grad_norm": 4.315411715741126, "learning_rate": 9.934338157040953e-06, "loss": 1.4340442419052124, "step": 36 }, { "epoch": 0.4713375796178344, "grad_norm": 4.598194925817704, "learning_rate": 9.921889571113629e-06, "loss": 1.5494410991668701, "step": 37 }, { "epoch": 0.4840764331210191, "grad_norm": 4.243095090396253, "learning_rate": 9.90837029325229e-06, "loss": 1.4130847454071045, "step": 38 }, { "epoch": 0.4968152866242038, "grad_norm": 4.980649623484297, "learning_rate": 9.893783264396903e-06, "loss": 1.4265036582946777, "step": 39 }, { "epoch": 0.5095541401273885, "grad_norm": 5.789533896785179, "learning_rate": 9.878131657762535e-06, "loss": 1.4373618364334106, "step": 40 }, { "epoch": 0.5222929936305732, "grad_norm": 4.658455364896436, "learning_rate": 9.861418878149056e-06, "loss": 1.4085681438446045, "step": 41 }, { "epoch": 0.535031847133758, "grad_norm": 5.324025858102516, "learning_rate": 9.843648561200476e-06, "loss": 1.452268123626709, "step": 42 }, { "epoch": 0.5477707006369427, "grad_norm": 4.692010278942193, "learning_rate": 9.82482457261405e-06, "loss": 1.6110832691192627, "step": 43 }, { "epoch": 0.5605095541401274, "grad_norm": 4.64177391127698, "learning_rate": 9.80495100729936e-06, "loss": 1.4959537982940674, "step": 44 }, { "epoch": 0.5732484076433121, "grad_norm": 4.196745696549577, "learning_rate": 9.784032188487507e-06, "loss": 1.4369564056396484, "step": 45 }, { "epoch": 0.5859872611464968, "grad_norm": 4.846501722779206, "learning_rate": 9.762072666790658e-06, "loss": 1.5659615993499756, "step": 46 }, { "epoch": 0.5987261146496815, "grad_norm": 4.475752207148854, "learning_rate": 9.73907721921212e-06, "loss": 1.6203088760375977, "step": 47 }, { "epoch": 0.6114649681528662, "grad_norm": 5.622443868901302, "learning_rate": 9.715050848107167e-06, "loss": 1.4394254684448242, "step": 48 }, { "epoch": 0.6242038216560509, "grad_norm": 5.282103052112391, "learning_rate": 9.689998780094839e-06, "loss": 1.3903216123580933, "step": 49 }, { "epoch": 0.6369426751592356, "grad_norm": 4.399414503844804, "learning_rate": 9.663926464920959e-06, "loss": 1.4829354286193848, "step": 50 }, { "epoch": 0.6496815286624203, "grad_norm": 4.523444723393011, "learning_rate": 9.636839574272623e-06, "loss": 1.5274395942687988, "step": 51 }, { "epoch": 0.6624203821656051, "grad_norm": 4.232538410435911, "learning_rate": 9.608744000544392e-06, "loss": 1.4694490432739258, "step": 52 }, { "epoch": 0.6751592356687898, "grad_norm": 4.354507161235457, "learning_rate": 9.579645855556481e-06, "loss": 1.2353503704071045, "step": 53 }, { "epoch": 0.6878980891719745, "grad_norm": 4.9180556110442595, "learning_rate": 9.54955146922521e-06, "loss": 1.4008901119232178, "step": 54 }, { "epoch": 0.7006369426751592, "grad_norm": 4.970650879718249, "learning_rate": 9.51846738818602e-06, "loss": 1.3539741039276123, "step": 55 }, { "epoch": 0.7133757961783439, "grad_norm": 4.837000418043291, "learning_rate": 9.48640037436934e-06, "loss": 1.3163714408874512, "step": 56 }, { "epoch": 0.7261146496815286, "grad_norm": 3.9679039776262064, "learning_rate": 9.453357403529609e-06, "loss": 1.3809059858322144, "step": 57 }, { "epoch": 0.7388535031847133, "grad_norm": 4.462452198138723, "learning_rate": 9.419345663727805e-06, "loss": 1.458146572113037, "step": 58 }, { "epoch": 0.7515923566878981, "grad_norm": 4.402982081383035, "learning_rate": 9.38437255376777e-06, "loss": 1.329193115234375, "step": 59 }, { "epoch": 0.7643312101910829, "grad_norm": 4.377947135685406, "learning_rate": 9.348445681586703e-06, "loss": 1.4500741958618164, "step": 60 }, { "epoch": 0.7770700636942676, "grad_norm": 4.198026205959271, "learning_rate": 9.31157286260014e-06, "loss": 1.4562097787857056, "step": 61 }, { "epoch": 0.7898089171974523, "grad_norm": 4.945285199299728, "learning_rate": 9.273762118001837e-06, "loss": 1.3661162853240967, "step": 62 }, { "epoch": 0.802547770700637, "grad_norm": 3.9573822911468266, "learning_rate": 9.235021673018849e-06, "loss": 1.3168445825576782, "step": 63 }, { "epoch": 0.8152866242038217, "grad_norm": 4.566194937738093, "learning_rate": 9.195359955122244e-06, "loss": 1.3281530141830444, "step": 64 }, { "epoch": 0.8280254777070064, "grad_norm": 4.657547711627972, "learning_rate": 9.15478559219382e-06, "loss": 1.3520253896713257, "step": 65 }, { "epoch": 0.8407643312101911, "grad_norm": 4.931346013168586, "learning_rate": 9.113307410649222e-06, "loss": 1.4982115030288696, "step": 66 }, { "epoch": 0.8535031847133758, "grad_norm": 4.205958958323587, "learning_rate": 9.070934433517872e-06, "loss": 1.402880311012268, "step": 67 }, { "epoch": 0.8662420382165605, "grad_norm": 4.265262844911349, "learning_rate": 9.027675878480131e-06, "loss": 1.4359843730926514, "step": 68 }, { "epoch": 0.8789808917197452, "grad_norm": 4.174081063602911, "learning_rate": 8.983541155862114e-06, "loss": 1.4095585346221924, "step": 69 }, { "epoch": 0.89171974522293, "grad_norm": 4.300497916224527, "learning_rate": 8.938539866588593e-06, "loss": 1.3254384994506836, "step": 70 }, { "epoch": 0.9044585987261147, "grad_norm": 4.321856820928802, "learning_rate": 8.892681800094447e-06, "loss": 1.3895121812820435, "step": 71 }, { "epoch": 0.9171974522292994, "grad_norm": 4.353418264893862, "learning_rate": 8.845976932195104e-06, "loss": 1.5136423110961914, "step": 72 }, { "epoch": 0.9299363057324841, "grad_norm": 4.509525726327509, "learning_rate": 8.798435422916425e-06, "loss": 1.560758352279663, "step": 73 }, { "epoch": 0.9426751592356688, "grad_norm": 4.14002546470194, "learning_rate": 8.750067614284534e-06, "loss": 1.2931057214736938, "step": 74 }, { "epoch": 0.9554140127388535, "grad_norm": 4.1809249012997345, "learning_rate": 8.700884028076042e-06, "loss": 1.5124843120574951, "step": 75 }, { "epoch": 0.9681528662420382, "grad_norm": 4.293885955875831, "learning_rate": 8.650895363529172e-06, "loss": 1.317713737487793, "step": 76 }, { "epoch": 0.9808917197452229, "grad_norm": 4.631688789038691, "learning_rate": 8.600112495016289e-06, "loss": 1.3039919137954712, "step": 77 }, { "epoch": 0.9936305732484076, "grad_norm": 4.27759477922895, "learning_rate": 8.548546469678311e-06, "loss": 1.495795488357544, "step": 78 }, { "epoch": 1.0, "grad_norm": 5.956623944392664, "learning_rate": 8.496208505021572e-06, "loss": 1.429541826248169, "step": 79 }, { "epoch": 1.0127388535031847, "grad_norm": 4.45796601634621, "learning_rate": 8.443109986477574e-06, "loss": 0.8995598554611206, "step": 80 }, { "epoch": 1.0254777070063694, "grad_norm": 4.500661347515663, "learning_rate": 8.389262464926256e-06, "loss": 0.63990318775177, "step": 81 }, { "epoch": 1.0382165605095541, "grad_norm": 3.8748063820496257, "learning_rate": 8.334677654183254e-06, "loss": 0.6055729985237122, "step": 82 }, { "epoch": 1.0509554140127388, "grad_norm": 3.4628880356772096, "learning_rate": 8.279367428451703e-06, "loss": 0.7356538772583008, "step": 83 }, { "epoch": 1.0636942675159236, "grad_norm": 3.6516848444686265, "learning_rate": 8.223343819739164e-06, "loss": 0.692323625087738, "step": 84 }, { "epoch": 1.0764331210191083, "grad_norm": 3.968197707946131, "learning_rate": 8.166619015240236e-06, "loss": 0.6772887706756592, "step": 85 }, { "epoch": 1.089171974522293, "grad_norm": 3.845941294941666, "learning_rate": 8.109205354685367e-06, "loss": 0.5514630675315857, "step": 86 }, { "epoch": 1.1019108280254777, "grad_norm": 3.774618366335066, "learning_rate": 8.051115327656538e-06, "loss": 0.6684471964836121, "step": 87 }, { "epoch": 1.1146496815286624, "grad_norm": 3.7047992437252, "learning_rate": 7.992361570870289e-06, "loss": 0.5766518712043762, "step": 88 }, { "epoch": 1.127388535031847, "grad_norm": 4.214676734133472, "learning_rate": 7.932956865428792e-06, "loss": 0.5921903848648071, "step": 89 }, { "epoch": 1.1401273885350318, "grad_norm": 4.387324313211908, "learning_rate": 7.872914134039485e-06, "loss": 0.592995285987854, "step": 90 }, { "epoch": 1.1528662420382165, "grad_norm": 4.005865244271663, "learning_rate": 7.812246438203905e-06, "loss": 0.5482683181762695, "step": 91 }, { "epoch": 1.1656050955414012, "grad_norm": 4.636384134136274, "learning_rate": 7.750966975376328e-06, "loss": 0.6826972365379333, "step": 92 }, { "epoch": 1.178343949044586, "grad_norm": 4.6953248944517245, "learning_rate": 7.689089076092851e-06, "loss": 0.5954027771949768, "step": 93 }, { "epoch": 1.1910828025477707, "grad_norm": 4.80616798771938, "learning_rate": 7.626626201071494e-06, "loss": 0.6095083355903625, "step": 94 }, { "epoch": 1.2038216560509554, "grad_norm": 4.48252749973364, "learning_rate": 7.563591938284012e-06, "loss": 0.709877610206604, "step": 95 }, { "epoch": 1.21656050955414, "grad_norm": 4.596613533967055, "learning_rate": 7.500000000000001e-06, "loss": 0.5784502029418945, "step": 96 }, { "epoch": 1.2292993630573248, "grad_norm": 4.76402532258561, "learning_rate": 7.4358642198039835e-06, "loss": 0.5837893486022949, "step": 97 }, { "epoch": 1.2420382165605095, "grad_norm": 4.440144626730792, "learning_rate": 7.371198549586091e-06, "loss": 0.7246421575546265, "step": 98 }, { "epoch": 1.2547770700636942, "grad_norm": 4.1554626239444605, "learning_rate": 7.306017056507018e-06, "loss": 0.5735586285591125, "step": 99 }, { "epoch": 1.267515923566879, "grad_norm": 5.004892398076429, "learning_rate": 7.240333919937893e-06, "loss": 0.5463488101959229, "step": 100 }, { "epoch": 1.2802547770700636, "grad_norm": 5.088476554254515, "learning_rate": 7.174163428375748e-06, "loss": 0.5633252859115601, "step": 101 }, { "epoch": 1.2929936305732483, "grad_norm": 5.118792774795437, "learning_rate": 7.107519976335241e-06, "loss": 0.5037230253219604, "step": 102 }, { "epoch": 1.305732484076433, "grad_norm": 4.75623015993911, "learning_rate": 7.040418061217325e-06, "loss": 0.5365867614746094, "step": 103 }, { "epoch": 1.3184713375796178, "grad_norm": 5.157812619262671, "learning_rate": 6.972872280155528e-06, "loss": 0.6433064937591553, "step": 104 }, { "epoch": 1.3312101910828025, "grad_norm": 5.148365945239476, "learning_rate": 6.9048973268405375e-06, "loss": 0.6543390154838562, "step": 105 }, { "epoch": 1.3439490445859872, "grad_norm": 4.5311604864334125, "learning_rate": 6.836507988323785e-06, "loss": 0.6132720708847046, "step": 106 }, { "epoch": 1.356687898089172, "grad_norm": 4.88971282799509, "learning_rate": 6.767719141800718e-06, "loss": 0.6079248189926147, "step": 107 }, { "epoch": 1.3694267515923566, "grad_norm": 4.695137801905107, "learning_rate": 6.698545751374465e-06, "loss": 0.6232650279998779, "step": 108 }, { "epoch": 1.3821656050955413, "grad_norm": 4.267620300562347, "learning_rate": 6.629002864800589e-06, "loss": 0.5911256074905396, "step": 109 }, { "epoch": 1.394904458598726, "grad_norm": 5.30166521900121, "learning_rate": 6.55910561021365e-06, "loss": 0.6404790282249451, "step": 110 }, { "epoch": 1.4076433121019107, "grad_norm": 4.964813016784396, "learning_rate": 6.488869192836279e-06, "loss": 0.6661736965179443, "step": 111 }, { "epoch": 1.4203821656050954, "grad_norm": 5.042827042141295, "learning_rate": 6.418308891671484e-06, "loss": 0.5621084570884705, "step": 112 }, { "epoch": 1.4331210191082802, "grad_norm": 4.554306311318436, "learning_rate": 6.347440056178904e-06, "loss": 0.5913956165313721, "step": 113 }, { "epoch": 1.4458598726114649, "grad_norm": 3.999260338697589, "learning_rate": 6.27627810293574e-06, "loss": 0.5895659327507019, "step": 114 }, { "epoch": 1.4585987261146496, "grad_norm": 4.488817511346429, "learning_rate": 6.204838512283073e-06, "loss": 0.6066327691078186, "step": 115 }, { "epoch": 1.4713375796178343, "grad_norm": 4.2048895000167725, "learning_rate": 6.133136824958334e-06, "loss": 0.579125165939331, "step": 116 }, { "epoch": 1.484076433121019, "grad_norm": 4.865801929274413, "learning_rate": 6.061188638714616e-06, "loss": 0.5661747455596924, "step": 117 }, { "epoch": 1.4968152866242037, "grad_norm": 4.0216175803478365, "learning_rate": 5.989009604927587e-06, "loss": 0.5881543159484863, "step": 118 }, { "epoch": 1.5095541401273884, "grad_norm": 4.672593821116511, "learning_rate": 5.916615425190744e-06, "loss": 0.6381370425224304, "step": 119 }, { "epoch": 1.5222929936305731, "grad_norm": 4.579578448838088, "learning_rate": 5.844021847899735e-06, "loss": 0.5820121765136719, "step": 120 }, { "epoch": 1.5350318471337578, "grad_norm": 4.9782337341335845, "learning_rate": 5.771244664826512e-06, "loss": 0.5244691371917725, "step": 121 }, { "epoch": 1.5477707006369426, "grad_norm": 4.615280693095074, "learning_rate": 5.698299707684031e-06, "loss": 0.6596621870994568, "step": 122 }, { "epoch": 1.5605095541401273, "grad_norm": 4.1315152129695205, "learning_rate": 5.6252028446822805e-06, "loss": 0.6240249872207642, "step": 123 }, { "epoch": 1.573248407643312, "grad_norm": 4.34694030117767, "learning_rate": 5.55196997707635e-06, "loss": 0.6121684312820435, "step": 124 }, { "epoch": 1.5859872611464967, "grad_norm": 4.736014683349439, "learning_rate": 5.478617035707337e-06, "loss": 0.581444263458252, "step": 125 }, { "epoch": 1.5987261146496814, "grad_norm": 4.42473315063519, "learning_rate": 5.4051599775368e-06, "loss": 0.5702801942825317, "step": 126 }, { "epoch": 1.611464968152866, "grad_norm": 4.723043711831375, "learning_rate": 5.33161478217552e-06, "loss": 0.643683671951294, "step": 127 }, { "epoch": 1.6242038216560508, "grad_norm": 4.615535634313775, "learning_rate": 5.257997448407366e-06, "loss": 0.6429088115692139, "step": 128 }, { "epoch": 1.6369426751592355, "grad_norm": 3.943237517267742, "learning_rate": 5.184323990708959e-06, "loss": 0.5036097764968872, "step": 129 }, { "epoch": 1.6496815286624202, "grad_norm": 5.369249891502365, "learning_rate": 5.110610435765935e-06, "loss": 0.6377817392349243, "step": 130 }, { "epoch": 1.662420382165605, "grad_norm": 4.645492978424057, "learning_rate": 5.0368728189865624e-06, "loss": 0.5092718601226807, "step": 131 }, { "epoch": 1.6751592356687897, "grad_norm": 4.9878218164552255, "learning_rate": 4.9631271810134375e-06, "loss": 0.6005362868309021, "step": 132 }, { "epoch": 1.6878980891719744, "grad_norm": 4.710856517549427, "learning_rate": 4.8893895642340665e-06, "loss": 0.4808087944984436, "step": 133 }, { "epoch": 1.700636942675159, "grad_norm": 4.962556354741984, "learning_rate": 4.815676009291044e-06, "loss": 0.6739586591720581, "step": 134 }, { "epoch": 1.7133757961783438, "grad_norm": 4.864043235726367, "learning_rate": 4.742002551592635e-06, "loss": 0.5722870826721191, "step": 135 }, { "epoch": 1.7261146496815285, "grad_norm": 5.805499130195261, "learning_rate": 4.668385217824482e-06, "loss": 0.5560994148254395, "step": 136 }, { "epoch": 1.7388535031847132, "grad_norm": 4.3754614924647734, "learning_rate": 4.594840022463201e-06, "loss": 0.6376844644546509, "step": 137 }, { "epoch": 1.7515923566878981, "grad_norm": 4.6276825029066515, "learning_rate": 4.5213829642926635e-06, "loss": 0.5070189237594604, "step": 138 }, { "epoch": 1.7643312101910829, "grad_norm": 5.058486321341029, "learning_rate": 4.4480300229236525e-06, "loss": 0.6301469206809998, "step": 139 }, { "epoch": 1.7770700636942676, "grad_norm": 4.631581699502946, "learning_rate": 4.374797155317721e-06, "loss": 0.5686060190200806, "step": 140 }, { "epoch": 1.7898089171974523, "grad_norm": 4.839930377645928, "learning_rate": 4.30170029231597e-06, "loss": 0.5702610015869141, "step": 141 }, { "epoch": 1.802547770700637, "grad_norm": 4.634251405852573, "learning_rate": 4.228755335173488e-06, "loss": 0.5375156402587891, "step": 142 }, { "epoch": 1.8152866242038217, "grad_norm": 5.224378872859397, "learning_rate": 4.155978152100266e-06, "loss": 0.588652491569519, "step": 143 }, { "epoch": 1.8280254777070064, "grad_norm": 5.243980650196693, "learning_rate": 4.0833845748092586e-06, "loss": 0.6560136079788208, "step": 144 }, { "epoch": 1.8407643312101911, "grad_norm": 4.870640612365541, "learning_rate": 4.010990395072414e-06, "loss": 0.5707780718803406, "step": 145 }, { "epoch": 1.8535031847133758, "grad_norm": 4.896770007248889, "learning_rate": 3.938811361285386e-06, "loss": 0.578855574131012, "step": 146 }, { "epoch": 1.8662420382165605, "grad_norm": 5.621832570155973, "learning_rate": 3.866863175041666e-06, "loss": 0.7337894439697266, "step": 147 }, { "epoch": 1.8789808917197452, "grad_norm": 4.788974930837312, "learning_rate": 3.7951614877169285e-06, "loss": 0.6584663391113281, "step": 148 }, { "epoch": 1.89171974522293, "grad_norm": 5.197175599878351, "learning_rate": 3.7237218970642624e-06, "loss": 0.5132451057434082, "step": 149 }, { "epoch": 1.9044585987261147, "grad_norm": 4.494637523697752, "learning_rate": 3.6525599438210956e-06, "loss": 0.5699691772460938, "step": 150 }, { "epoch": 1.9171974522292994, "grad_norm": 4.436597339850294, "learning_rate": 3.5816911083285165e-06, "loss": 0.6117175817489624, "step": 151 }, { "epoch": 1.929936305732484, "grad_norm": 4.71698618164443, "learning_rate": 3.511130807163724e-06, "loss": 0.48447686433792114, "step": 152 }, { "epoch": 1.9426751592356688, "grad_norm": 4.586270355395819, "learning_rate": 3.440894389786352e-06, "loss": 0.5775331854820251, "step": 153 }, { "epoch": 1.9554140127388535, "grad_norm": 5.467603736362664, "learning_rate": 3.370997135199413e-06, "loss": 0.6822047829627991, "step": 154 }, { "epoch": 1.9681528662420382, "grad_norm": 5.092809942708443, "learning_rate": 3.3014542486255365e-06, "loss": 0.620025098323822, "step": 155 }, { "epoch": 1.980891719745223, "grad_norm": 4.782057480529959, "learning_rate": 3.2322808581992825e-06, "loss": 0.6051990985870361, "step": 156 }, { "epoch": 1.9936305732484076, "grad_norm": 5.07119310501042, "learning_rate": 3.1634920116762175e-06, "loss": 0.5013089776039124, "step": 157 }, { "epoch": 2.0, "grad_norm": 5.834245362327659, "learning_rate": 3.0951026731594634e-06, "loss": 0.41039198637008667, "step": 158 }, { "epoch": 2.0127388535031847, "grad_norm": 2.970713570403218, "learning_rate": 3.0271277198444737e-06, "loss": 0.14488917589187622, "step": 159 }, { "epoch": 2.0254777070063694, "grad_norm": 3.3900669209478917, "learning_rate": 2.9595819387826753e-06, "loss": 0.17139403522014618, "step": 160 }, { "epoch": 2.038216560509554, "grad_norm": 3.148172373199878, "learning_rate": 2.89248002366476e-06, "loss": 0.13938947021961212, "step": 161 }, { "epoch": 2.050955414012739, "grad_norm": 3.292222772844883, "learning_rate": 2.8258365716242543e-06, "loss": 0.19142913818359375, "step": 162 }, { "epoch": 2.0636942675159236, "grad_norm": 5.062552654446493, "learning_rate": 2.7596660800621076e-06, "loss": 0.32667019963264465, "step": 163 }, { "epoch": 2.0764331210191083, "grad_norm": 2.9195663792104853, "learning_rate": 2.6939829434929834e-06, "loss": 0.16923490166664124, "step": 164 }, { "epoch": 2.089171974522293, "grad_norm": 2.660735105353199, "learning_rate": 2.6288014504139104e-06, "loss": 0.16544359922409058, "step": 165 }, { "epoch": 2.1019108280254777, "grad_norm": 2.9195377278173438, "learning_rate": 2.5641357801960186e-06, "loss": 0.13166563212871552, "step": 166 }, { "epoch": 2.1146496815286624, "grad_norm": 2.7115850726819133, "learning_rate": 2.5000000000000015e-06, "loss": 0.1502484679222107, "step": 167 }, { "epoch": 2.127388535031847, "grad_norm": 2.5246541477672957, "learning_rate": 2.4364080617159885e-06, "loss": 0.12001603841781616, "step": 168 }, { "epoch": 2.140127388535032, "grad_norm": 2.906306753932353, "learning_rate": 2.373373798928507e-06, "loss": 0.16388744115829468, "step": 169 }, { "epoch": 2.1528662420382165, "grad_norm": 3.3313464695860855, "learning_rate": 2.310910923907149e-06, "loss": 0.17085227370262146, "step": 170 }, { "epoch": 2.1656050955414012, "grad_norm": 3.537696001337278, "learning_rate": 2.249033024623672e-06, "loss": 0.1649709939956665, "step": 171 }, { "epoch": 2.178343949044586, "grad_norm": 3.0477614078497157, "learning_rate": 2.187753561796097e-06, "loss": 0.13725437223911285, "step": 172 }, { "epoch": 2.1910828025477707, "grad_norm": 3.108829906302373, "learning_rate": 2.127085865960516e-06, "loss": 0.14223095774650574, "step": 173 }, { "epoch": 2.2038216560509554, "grad_norm": 3.188987721207745, "learning_rate": 2.0670431345712092e-06, "loss": 0.1432873010635376, "step": 174 }, { "epoch": 2.21656050955414, "grad_norm": 3.5488199597897045, "learning_rate": 2.0076384291297134e-06, "loss": 0.1355983018875122, "step": 175 }, { "epoch": 2.229299363057325, "grad_norm": 2.9979876656948483, "learning_rate": 1.9488846723434646e-06, "loss": 0.13247933983802795, "step": 176 }, { "epoch": 2.2420382165605095, "grad_norm": 3.443337367597467, "learning_rate": 1.890794645314633e-06, "loss": 0.1308836191892624, "step": 177 }, { "epoch": 2.254777070063694, "grad_norm": 4.121646470867133, "learning_rate": 1.8333809847597644e-06, "loss": 0.15963426232337952, "step": 178 }, { "epoch": 2.267515923566879, "grad_norm": 4.118828059264668, "learning_rate": 1.7766561802608374e-06, "loss": 0.14805136620998383, "step": 179 }, { "epoch": 2.2802547770700636, "grad_norm": 3.9708198011551166, "learning_rate": 1.7206325715483003e-06, "loss": 0.12024472653865814, "step": 180 }, { "epoch": 2.2929936305732483, "grad_norm": 3.439106672469071, "learning_rate": 1.665322345816746e-06, "loss": 0.11454702913761139, "step": 181 }, { "epoch": 2.305732484076433, "grad_norm": 3.4010452876916615, "learning_rate": 1.6107375350737437e-06, "loss": 0.10992666333913803, "step": 182 }, { "epoch": 2.3184713375796178, "grad_norm": 3.5752577926580975, "learning_rate": 1.556890013522428e-06, "loss": 0.09631110727787018, "step": 183 }, { "epoch": 2.3312101910828025, "grad_norm": 3.8387220728977343, "learning_rate": 1.50379149497843e-06, "loss": 0.14856451749801636, "step": 184 }, { "epoch": 2.343949044585987, "grad_norm": 3.444989482317406, "learning_rate": 1.4514535303216893e-06, "loss": 0.09073778241872787, "step": 185 }, { "epoch": 2.356687898089172, "grad_norm": 3.2622590339488124, "learning_rate": 1.3998875049837141e-06, "loss": 0.10596369206905365, "step": 186 }, { "epoch": 2.3694267515923566, "grad_norm": 4.072722677232836, "learning_rate": 1.3491046364708294e-06, "loss": 0.1488298773765564, "step": 187 }, { "epoch": 2.3821656050955413, "grad_norm": 4.114774744144093, "learning_rate": 1.2991159719239581e-06, "loss": 0.13143031299114227, "step": 188 }, { "epoch": 2.394904458598726, "grad_norm": 3.792643277657603, "learning_rate": 1.249932385715467e-06, "loss": 0.12935219705104828, "step": 189 }, { "epoch": 2.4076433121019107, "grad_norm": 3.6041653995445, "learning_rate": 1.2015645770835765e-06, "loss": 0.10895463824272156, "step": 190 }, { "epoch": 2.4203821656050954, "grad_norm": 3.419036474508468, "learning_rate": 1.1540230678048969e-06, "loss": 0.11770664900541306, "step": 191 }, { "epoch": 2.43312101910828, "grad_norm": 3.8473062967203626, "learning_rate": 1.1073181999055538e-06, "loss": 0.12943175435066223, "step": 192 }, { "epoch": 2.445859872611465, "grad_norm": 4.213646564060963, "learning_rate": 1.0614601334114099e-06, "loss": 0.15990746021270752, "step": 193 }, { "epoch": 2.4585987261146496, "grad_norm": 3.292740607382361, "learning_rate": 1.016458844137887e-06, "loss": 0.0967484638094902, "step": 194 }, { "epoch": 2.4713375796178343, "grad_norm": 3.3587679937993675, "learning_rate": 9.723241215198692e-07, "loss": 0.09274256229400635, "step": 195 }, { "epoch": 2.484076433121019, "grad_norm": 3.415144877613833, "learning_rate": 9.290655664821296e-07, "loss": 0.12071307003498077, "step": 196 }, { "epoch": 2.4968152866242037, "grad_norm": 3.614520056467126, "learning_rate": 8.866925893507805e-07, "loss": 0.14337831735610962, "step": 197 }, { "epoch": 2.5095541401273884, "grad_norm": 3.1413281076463333, "learning_rate": 8.45214407806182e-07, "loss": 0.1311374008655548, "step": 198 }, { "epoch": 2.522292993630573, "grad_norm": 3.5634546960778963, "learning_rate": 8.046400448777575e-07, "loss": 0.12355434894561768, "step": 199 }, { "epoch": 2.535031847133758, "grad_norm": 3.55245812518791, "learning_rate": 7.649783269811523e-07, "loss": 0.11268627643585205, "step": 200 }, { "epoch": 2.5477707006369426, "grad_norm": 3.6047204962278205, "learning_rate": 7.26237881998163e-07, "loss": 0.1278030276298523, "step": 201 }, { "epoch": 2.5605095541401273, "grad_norm": 3.826082333377558, "learning_rate": 6.884271373998608e-07, "loss": 0.11588963866233826, "step": 202 }, { "epoch": 2.573248407643312, "grad_norm": 3.3477539285078044, "learning_rate": 6.515543184133e-07, "loss": 0.11168617010116577, "step": 203 }, { "epoch": 2.5859872611464967, "grad_norm": 3.40070063216114, "learning_rate": 6.156274462322292e-07, "loss": 0.14677459001541138, "step": 204 }, { "epoch": 2.5987261146496814, "grad_norm": 3.6867880675958333, "learning_rate": 5.806543362721945e-07, "loss": 0.1080314964056015, "step": 205 }, { "epoch": 2.611464968152866, "grad_norm": 3.50805046104141, "learning_rate": 5.466425964703914e-07, "loss": 0.10917598009109497, "step": 206 }, { "epoch": 2.624203821656051, "grad_norm": 3.744240792349818, "learning_rate": 5.135996256306619e-07, "loss": 0.10850804299116135, "step": 207 }, { "epoch": 2.6369426751592355, "grad_norm": 3.2636204792288184, "learning_rate": 4.815326118139813e-07, "loss": 0.23395496606826782, "step": 208 }, { "epoch": 2.6496815286624202, "grad_norm": 3.3320803212307895, "learning_rate": 4.5044853077479134e-07, "loss": 0.09678040444850922, "step": 209 }, { "epoch": 2.662420382165605, "grad_norm": 3.3126443611241005, "learning_rate": 4.203541444435211e-07, "loss": 0.09082137048244476, "step": 210 }, { "epoch": 2.6751592356687897, "grad_norm": 3.463640048859196, "learning_rate": 3.9125599945560866e-07, "loss": 0.12093393504619598, "step": 211 }, { "epoch": 2.6878980891719744, "grad_norm": 4.1484131801868225, "learning_rate": 3.631604257273774e-07, "loss": 0.12841008603572845, "step": 212 }, { "epoch": 2.700636942675159, "grad_norm": 3.4819962567564544, "learning_rate": 3.360735350790428e-07, "loss": 0.1454203575849533, "step": 213 }, { "epoch": 2.713375796178344, "grad_norm": 3.3217850732913834, "learning_rate": 3.100012199051627e-07, "loss": 0.12103286385536194, "step": 214 }, { "epoch": 2.7261146496815285, "grad_norm": 3.4551976218750706, "learning_rate": 2.8494915189283325e-07, "loss": 0.13519585132598877, "step": 215 }, { "epoch": 2.738853503184713, "grad_norm": 3.5046747113231738, "learning_rate": 2.6092278078788004e-07, "loss": 0.14792990684509277, "step": 216 }, { "epoch": 2.7515923566878984, "grad_norm": 4.057009589896516, "learning_rate": 2.3792733320934348e-07, "loss": 0.1573294997215271, "step": 217 }, { "epoch": 2.7643312101910826, "grad_norm": 3.485812762552763, "learning_rate": 2.1596781151249524e-07, "loss": 0.15241427719593048, "step": 218 }, { "epoch": 2.777070063694268, "grad_norm": 2.8563228482207395, "learning_rate": 1.9504899270064105e-07, "loss": 0.11122366786003113, "step": 219 }, { "epoch": 2.789808917197452, "grad_norm": 3.219771759621168, "learning_rate": 1.7517542738595071e-07, "loss": 0.11351308226585388, "step": 220 }, { "epoch": 2.802547770700637, "grad_norm": 3.4195554560904107, "learning_rate": 1.5635143879952575e-07, "loss": 0.1188071146607399, "step": 221 }, { "epoch": 2.8152866242038215, "grad_norm": 2.9103932269106374, "learning_rate": 1.3858112185094418e-07, "loss": 0.1164408028125763, "step": 222 }, { "epoch": 2.8280254777070066, "grad_norm": 3.6450799822214144, "learning_rate": 1.2186834223746612e-07, "loss": 0.12760576605796814, "step": 223 }, { "epoch": 2.840764331210191, "grad_norm": 3.3225130395239253, "learning_rate": 1.0621673560309798e-07, "loss": 0.11487654596567154, "step": 224 }, { "epoch": 2.853503184713376, "grad_norm": 3.2905886122232397, "learning_rate": 9.162970674771177e-08, "loss": 0.11246581375598907, "step": 225 }, { "epoch": 2.8662420382165603, "grad_norm": 3.504394500719592, "learning_rate": 7.81104288863721e-08, "loss": 0.09955516457557678, "step": 226 }, { "epoch": 2.8789808917197455, "grad_norm": 3.216564908375023, "learning_rate": 6.566184295904777e-08, "loss": 0.12330685555934906, "step": 227 }, { "epoch": 2.8917197452229297, "grad_norm": 3.607447433445088, "learning_rate": 5.4286656990847897e-08, "loss": 0.12849846482276917, "step": 228 }, { "epoch": 2.904458598726115, "grad_norm": 3.3244783180187425, "learning_rate": 4.398734550292716e-08, "loss": 0.11019767820835114, "step": 229 }, { "epoch": 2.917197452229299, "grad_norm": 3.1359379558395957, "learning_rate": 3.476614897418573e-08, "loss": 0.10802481323480606, "step": 230 }, { "epoch": 2.9299363057324843, "grad_norm": 3.236602655895111, "learning_rate": 2.6625073353884756e-08, "loss": 0.11602732539176941, "step": 231 }, { "epoch": 2.9426751592356686, "grad_norm": 3.2263437658209133, "learning_rate": 1.9565889625275945e-08, "loss": 0.12483286112546921, "step": 232 }, { "epoch": 2.9554140127388537, "grad_norm": 3.4340551157608235, "learning_rate": 1.3590133420350315e-08, "loss": 0.10575878620147705, "step": 233 }, { "epoch": 2.968152866242038, "grad_norm": 3.903227901454765, "learning_rate": 8.699104685779835e-09, "loss": 0.14583438634872437, "step": 234 }, { "epoch": 2.980891719745223, "grad_norm": 3.884552247317161, "learning_rate": 4.89386740013198e-09, "loss": 0.12648674845695496, "step": 235 }, { "epoch": 2.9936305732484074, "grad_norm": 3.2859302150161747, "learning_rate": 2.1752493424148647e-09, "loss": 0.1414915770292282, "step": 236 }, { "epoch": 3.0, "grad_norm": 2.7515804191453306, "learning_rate": 5.438419120062933e-10, "loss": 0.0598013773560524, "step": 237 }, { "epoch": 3.0, "step": 237, "total_flos": 4888319754240.0, "train_loss": 0.7582035779575759, "train_runtime": 573.8365, "train_samples_per_second": 26.14, "train_steps_per_second": 0.413 } ], "logging_steps": 1, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4888319754240.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }