{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 2.8039450645446777, "learning_rate": 6.125e-05, "loss": 8.1001, "step": 50 }, { "epoch": 0.4, "grad_norm": 1.782693862915039, "learning_rate": 0.00012375, "loss": 6.0373, "step": 100 }, { "epoch": 0.6, "grad_norm": 1.6799567937850952, "learning_rate": 0.00018625, "loss": 5.4579, "step": 150 }, { "epoch": 0.8, "grad_norm": 1.5503798723220825, "learning_rate": 0.00024875, "loss": 5.237, "step": 200 }, { "epoch": 1.0, "grad_norm": 1.5336804389953613, "learning_rate": 0.00024788793103448277, "loss": 5.0786, "step": 250 }, { "epoch": 1.2, "grad_norm": 1.5419166088104248, "learning_rate": 0.0002457327586206897, "loss": 4.6644, "step": 300 }, { "epoch": 1.4, "grad_norm": 1.5083167552947998, "learning_rate": 0.00024357758620689656, "loss": 4.5686, "step": 350 }, { "epoch": 1.6, "grad_norm": 1.5312546491622925, "learning_rate": 0.00024142241379310344, "loss": 4.4955, "step": 400 }, { "epoch": 1.8, "grad_norm": 1.4562275409698486, "learning_rate": 0.00023926724137931035, "loss": 4.4404, "step": 450 }, { "epoch": 2.0, "grad_norm": 1.547753930091858, "learning_rate": 0.00023711206896551723, "loss": 4.379, "step": 500 }, { "epoch": 2.2, "grad_norm": 1.611249566078186, "learning_rate": 0.00023495689655172414, "loss": 4.01, "step": 550 }, { "epoch": 2.4, "grad_norm": 1.383882999420166, "learning_rate": 0.00023280172413793105, "loss": 3.9787, "step": 600 }, { "epoch": 2.6, "grad_norm": 1.521315574645996, "learning_rate": 0.00023064655172413793, "loss": 3.9931, "step": 650 }, { "epoch": 2.8, "grad_norm": 1.4775588512420654, "learning_rate": 0.00022849137931034484, "loss": 3.915, "step": 700 }, { "epoch": 3.0, "grad_norm": 1.5033624172210693, "learning_rate": 0.00022633620689655173, "loss": 3.9011, "step": 750 }, { "epoch": 3.2, "grad_norm": 1.6199653148651123, "learning_rate": 0.0002241810344827586, "loss": 3.5909, "step": 800 }, { "epoch": 3.4, "grad_norm": 1.4687559604644775, "learning_rate": 0.00022202586206896552, "loss": 3.6409, "step": 850 }, { "epoch": 3.6, "grad_norm": 1.4177491664886475, "learning_rate": 0.00021987068965517243, "loss": 3.5883, "step": 900 }, { "epoch": 3.8, "grad_norm": 1.395131230354309, "learning_rate": 0.0002177155172413793, "loss": 3.5468, "step": 950 }, { "epoch": 4.0, "grad_norm": 1.4466007947921753, "learning_rate": 0.00021556034482758622, "loss": 3.5746, "step": 1000 }, { "epoch": 4.2, "grad_norm": 1.3139097690582275, "learning_rate": 0.00021340517241379313, "loss": 3.3073, "step": 1050 }, { "epoch": 4.4, "grad_norm": 1.5377048254013062, "learning_rate": 0.00021124999999999998, "loss": 3.3136, "step": 1100 }, { "epoch": 4.6, "grad_norm": 1.4794890880584717, "learning_rate": 0.0002090948275862069, "loss": 3.287, "step": 1150 }, { "epoch": 4.8, "grad_norm": 1.458489179611206, "learning_rate": 0.0002069396551724138, "loss": 3.3283, "step": 1200 }, { "epoch": 5.0, "grad_norm": 1.5431376695632935, "learning_rate": 0.0002047844827586207, "loss": 3.2917, "step": 1250 }, { "epoch": 5.2, "grad_norm": 1.27516770362854, "learning_rate": 0.0002026293103448276, "loss": 3.0597, "step": 1300 }, { "epoch": 5.4, "grad_norm": 1.572311282157898, "learning_rate": 0.0002004741379310345, "loss": 3.0982, "step": 1350 }, { "epoch": 5.6, "grad_norm": 1.4160852432250977, "learning_rate": 0.0001983189655172414, "loss": 3.0492, "step": 1400 }, { "epoch": 5.8, "grad_norm": 1.5755418539047241, "learning_rate": 0.0001961637931034483, "loss": 3.0756, "step": 1450 }, { "epoch": 6.0, "grad_norm": 1.5663151741027832, "learning_rate": 0.00019400862068965515, "loss": 3.1045, "step": 1500 }, { "epoch": 6.2, "grad_norm": 1.4873141050338745, "learning_rate": 0.00019185344827586206, "loss": 2.857, "step": 1550 }, { "epoch": 6.4, "grad_norm": 1.4372650384902954, "learning_rate": 0.00018969827586206897, "loss": 2.8887, "step": 1600 }, { "epoch": 6.6, "grad_norm": 1.6041676998138428, "learning_rate": 0.00018754310344827585, "loss": 2.8847, "step": 1650 }, { "epoch": 6.8, "grad_norm": 1.597738265991211, "learning_rate": 0.00018538793103448276, "loss": 2.9167, "step": 1700 }, { "epoch": 7.0, "grad_norm": 1.4618918895721436, "learning_rate": 0.00018323275862068967, "loss": 2.9025, "step": 1750 }, { "epoch": 7.2, "grad_norm": 1.4208807945251465, "learning_rate": 0.00018107758620689656, "loss": 2.6906, "step": 1800 }, { "epoch": 7.4, "grad_norm": 1.3772602081298828, "learning_rate": 0.00017892241379310344, "loss": 2.7045, "step": 1850 }, { "epoch": 7.6, "grad_norm": 1.3876913785934448, "learning_rate": 0.00017676724137931035, "loss": 2.7185, "step": 1900 }, { "epoch": 7.8, "grad_norm": 1.6142256259918213, "learning_rate": 0.00017461206896551723, "loss": 2.7453, "step": 1950 }, { "epoch": 8.0, "grad_norm": 1.7862074375152588, "learning_rate": 0.00017245689655172414, "loss": 2.7847, "step": 2000 }, { "epoch": 8.2, "grad_norm": 1.520495057106018, "learning_rate": 0.00017030172413793105, "loss": 2.5499, "step": 2050 }, { "epoch": 8.4, "grad_norm": 1.5806176662445068, "learning_rate": 0.00016814655172413793, "loss": 2.5625, "step": 2100 }, { "epoch": 8.6, "grad_norm": 1.4896948337554932, "learning_rate": 0.00016599137931034484, "loss": 2.576, "step": 2150 }, { "epoch": 8.8, "grad_norm": 1.5307574272155762, "learning_rate": 0.00016383620689655172, "loss": 2.6138, "step": 2200 }, { "epoch": 9.0, "grad_norm": 1.5115731954574585, "learning_rate": 0.0001616810344827586, "loss": 2.6294, "step": 2250 }, { "epoch": 9.2, "grad_norm": 1.5139647722244263, "learning_rate": 0.00015952586206896552, "loss": 2.4292, "step": 2300 }, { "epoch": 9.4, "grad_norm": 1.4001915454864502, "learning_rate": 0.00015737068965517243, "loss": 2.444, "step": 2350 }, { "epoch": 9.6, "grad_norm": 1.3104708194732666, "learning_rate": 0.0001552155172413793, "loss": 2.4509, "step": 2400 }, { "epoch": 9.8, "grad_norm": 1.6207849979400635, "learning_rate": 0.00015306034482758622, "loss": 2.4975, "step": 2450 }, { "epoch": 10.0, "grad_norm": 1.5676051378250122, "learning_rate": 0.00015090517241379313, "loss": 2.4849, "step": 2500 }, { "epoch": 10.2, "grad_norm": 1.6440703868865967, "learning_rate": 0.00014874999999999998, "loss": 2.2924, "step": 2550 }, { "epoch": 10.4, "grad_norm": 1.434339165687561, "learning_rate": 0.0001465948275862069, "loss": 2.3264, "step": 2600 }, { "epoch": 10.6, "grad_norm": 1.5489099025726318, "learning_rate": 0.0001444396551724138, "loss": 2.3671, "step": 2650 }, { "epoch": 10.8, "grad_norm": 1.3774558305740356, "learning_rate": 0.00014228448275862069, "loss": 2.3834, "step": 2700 }, { "epoch": 11.0, "grad_norm": 1.4112049341201782, "learning_rate": 0.0001401293103448276, "loss": 2.386, "step": 2750 }, { "epoch": 11.2, "grad_norm": 1.637389898300171, "learning_rate": 0.0001379741379310345, "loss": 2.1944, "step": 2800 }, { "epoch": 11.4, "grad_norm": 1.717990756034851, "learning_rate": 0.0001358189655172414, "loss": 2.2413, "step": 2850 }, { "epoch": 11.6, "grad_norm": 1.5509297847747803, "learning_rate": 0.0001336637931034483, "loss": 2.2519, "step": 2900 }, { "epoch": 11.8, "grad_norm": 1.3987082242965698, "learning_rate": 0.00013150862068965515, "loss": 2.2691, "step": 2950 }, { "epoch": 12.0, "grad_norm": 1.4241447448730469, "learning_rate": 0.00012935344827586206, "loss": 2.2907, "step": 3000 }, { "epoch": 12.2, "grad_norm": 1.4504032135009766, "learning_rate": 0.00012719827586206897, "loss": 2.1044, "step": 3050 }, { "epoch": 12.4, "grad_norm": 1.6953134536743164, "learning_rate": 0.00012504310344827585, "loss": 2.1447, "step": 3100 }, { "epoch": 12.6, "grad_norm": 1.6625018119812012, "learning_rate": 0.00012288793103448276, "loss": 2.1829, "step": 3150 }, { "epoch": 12.8, "grad_norm": 1.5335619449615479, "learning_rate": 0.00012073275862068966, "loss": 2.2124, "step": 3200 }, { "epoch": 13.0, "grad_norm": 1.4856654405593872, "learning_rate": 0.00011857758620689656, "loss": 2.1592, "step": 3250 }, { "epoch": 13.2, "grad_norm": 1.517542839050293, "learning_rate": 0.00011642241379310345, "loss": 2.0258, "step": 3300 }, { "epoch": 13.4, "grad_norm": 1.494611382484436, "learning_rate": 0.00011426724137931035, "loss": 2.0496, "step": 3350 }, { "epoch": 13.6, "grad_norm": 1.5751359462738037, "learning_rate": 0.00011211206896551724, "loss": 2.0851, "step": 3400 }, { "epoch": 13.8, "grad_norm": 1.5572459697723389, "learning_rate": 0.00010995689655172414, "loss": 2.1255, "step": 3450 }, { "epoch": 14.0, "grad_norm": 1.5155609846115112, "learning_rate": 0.00010780172413793104, "loss": 2.0991, "step": 3500 }, { "epoch": 14.2, "grad_norm": 1.48418128490448, "learning_rate": 0.00010564655172413793, "loss": 1.9705, "step": 3550 }, { "epoch": 14.4, "grad_norm": 1.5270135402679443, "learning_rate": 0.00010349137931034483, "loss": 2.0004, "step": 3600 }, { "epoch": 14.6, "grad_norm": 1.5730317831039429, "learning_rate": 0.00010133620689655172, "loss": 2.0317, "step": 3650 }, { "epoch": 14.8, "grad_norm": 1.5356438159942627, "learning_rate": 9.918103448275863e-05, "loss": 1.9999, "step": 3700 }, { "epoch": 15.0, "grad_norm": 1.6630584001541138, "learning_rate": 9.702586206896552e-05, "loss": 2.0269, "step": 3750 }, { "epoch": 15.2, "grad_norm": 1.5010318756103516, "learning_rate": 9.487068965517241e-05, "loss": 1.9149, "step": 3800 }, { "epoch": 15.4, "grad_norm": 1.5633668899536133, "learning_rate": 9.271551724137932e-05, "loss": 1.9338, "step": 3850 }, { "epoch": 15.6, "grad_norm": 1.59940767288208, "learning_rate": 9.056034482758622e-05, "loss": 1.946, "step": 3900 }, { "epoch": 15.8, "grad_norm": 1.5850439071655273, "learning_rate": 8.84051724137931e-05, "loss": 1.9543, "step": 3950 }, { "epoch": 16.0, "grad_norm": 1.5602020025253296, "learning_rate": 8.625e-05, "loss": 1.9461, "step": 4000 }, { "epoch": 16.2, "grad_norm": 1.679661750793457, "learning_rate": 8.40948275862069e-05, "loss": 1.8593, "step": 4050 }, { "epoch": 16.4, "grad_norm": 1.5591189861297607, "learning_rate": 8.193965517241379e-05, "loss": 1.8601, "step": 4100 }, { "epoch": 16.6, "grad_norm": 1.426254153251648, "learning_rate": 7.978448275862068e-05, "loss": 1.9044, "step": 4150 }, { "epoch": 16.8, "grad_norm": 1.553831696510315, "learning_rate": 7.76293103448276e-05, "loss": 1.8799, "step": 4200 }, { "epoch": 17.0, "grad_norm": 1.7395073175430298, "learning_rate": 7.547413793103449e-05, "loss": 1.9057, "step": 4250 }, { "epoch": 17.2, "grad_norm": 1.6055529117584229, "learning_rate": 7.331896551724137e-05, "loss": 1.7787, "step": 4300 }, { "epoch": 17.4, "grad_norm": 1.6556615829467773, "learning_rate": 7.116379310344828e-05, "loss": 1.7999, "step": 4350 }, { "epoch": 17.6, "grad_norm": 1.632834553718567, "learning_rate": 6.900862068965518e-05, "loss": 1.8312, "step": 4400 }, { "epoch": 17.8, "grad_norm": 1.749241828918457, "learning_rate": 6.685344827586206e-05, "loss": 1.8596, "step": 4450 }, { "epoch": 18.0, "grad_norm": 1.4276556968688965, "learning_rate": 6.469827586206897e-05, "loss": 1.8669, "step": 4500 }, { "epoch": 18.2, "grad_norm": 1.562339425086975, "learning_rate": 6.254310344827587e-05, "loss": 1.755, "step": 4550 }, { "epoch": 18.4, "grad_norm": 1.4620212316513062, "learning_rate": 6.0387931034482755e-05, "loss": 1.7945, "step": 4600 }, { "epoch": 18.6, "grad_norm": 1.601141095161438, "learning_rate": 5.823275862068966e-05, "loss": 1.7674, "step": 4650 }, { "epoch": 18.8, "grad_norm": 1.6034783124923706, "learning_rate": 5.6077586206896554e-05, "loss": 1.7728, "step": 4700 }, { "epoch": 19.0, "grad_norm": 1.545716643333435, "learning_rate": 5.392241379310345e-05, "loss": 1.8167, "step": 4750 }, { "epoch": 19.2, "grad_norm": 1.442853569984436, "learning_rate": 5.1767241379310346e-05, "loss": 1.7204, "step": 4800 }, { "epoch": 19.4, "grad_norm": 1.524326205253601, "learning_rate": 4.961206896551725e-05, "loss": 1.7265, "step": 4850 }, { "epoch": 19.6, "grad_norm": 1.6314244270324707, "learning_rate": 4.745689655172414e-05, "loss": 1.7385, "step": 4900 }, { "epoch": 19.8, "grad_norm": 1.5261213779449463, "learning_rate": 4.5301724137931034e-05, "loss": 1.7385, "step": 4950 }, { "epoch": 20.0, "grad_norm": 1.6839487552642822, "learning_rate": 4.314655172413793e-05, "loss": 1.7663, "step": 5000 } ], "logging_steps": 50, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 24, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5233484759040000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }