{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5, "grad_norm": 38.81770706176758, "learning_rate": 0.0, "loss": 1.669, "step": 1 }, { "epoch": 1.0, "grad_norm": 42.17606735229492, "learning_rate": 8.333333333333333e-07, "loss": 1.7313, "step": 2 }, { "epoch": 1.0, "eval_loss": 1.7900009155273438, "eval_runtime": 7.5875, "eval_samples_per_second": 4.613, "eval_steps_per_second": 0.659, "step": 2 }, { "epoch": 1.5, "grad_norm": 40.34153366088867, "learning_rate": 1.6666666666666667e-06, "loss": 1.662, "step": 3 }, { "epoch": 2.0, "grad_norm": 37.466861724853516, "learning_rate": 2.5e-06, "loss": 1.6378, "step": 4 }, { "epoch": 2.0, "eval_loss": 1.5364655256271362, "eval_runtime": 7.4642, "eval_samples_per_second": 4.689, "eval_steps_per_second": 0.67, "step": 4 }, { "epoch": 2.5, "grad_norm": 29.840675354003906, "learning_rate": 3.3333333333333333e-06, "loss": 1.4225, "step": 5 }, { "epoch": 3.0, "grad_norm": 13.304512977600098, "learning_rate": 4.166666666666667e-06, "loss": 1.1356, "step": 6 }, { "epoch": 3.0, "eval_loss": 1.1185977458953857, "eval_runtime": 7.4596, "eval_samples_per_second": 4.692, "eval_steps_per_second": 0.67, "step": 6 }, { "epoch": 3.5, "grad_norm": 10.08704662322998, "learning_rate": 5e-06, "loss": 1.0871, "step": 7 }, { "epoch": 4.0, "grad_norm": 3.6056466102600098, "learning_rate": 4.995770395678171e-06, "loss": 0.9041, "step": 8 }, { "epoch": 4.0, "eval_loss": 0.9372425079345703, "eval_runtime": 7.4702, "eval_samples_per_second": 4.685, "eval_steps_per_second": 0.669, "step": 8 }, { "epoch": 4.5, "grad_norm": 3.1515934467315674, "learning_rate": 4.983095894354858e-06, "loss": 0.8773, "step": 9 }, { "epoch": 5.0, "grad_norm": 2.810807704925537, "learning_rate": 4.962019382530521e-06, "loss": 0.8762, "step": 10 }, { "epoch": 5.0, "eval_loss": 0.8529078364372253, "eval_runtime": 7.4567, "eval_samples_per_second": 4.694, "eval_steps_per_second": 0.671, "step": 10 }, { "epoch": 5.5, "grad_norm": 2.751431465148926, "learning_rate": 4.93261217644956e-06, "loss": 0.8024, "step": 11 }, { "epoch": 6.0, "grad_norm": 3.107816219329834, "learning_rate": 4.894973780788722e-06, "loss": 0.7807, "step": 12 }, { "epoch": 6.0, "eval_loss": 0.8198402523994446, "eval_runtime": 7.4664, "eval_samples_per_second": 4.688, "eval_steps_per_second": 0.67, "step": 12 }, { "epoch": 6.5, "grad_norm": 2.681008815765381, "learning_rate": 4.849231551964771e-06, "loss": 0.758, "step": 13 }, { "epoch": 7.0, "grad_norm": 2.2834079265594482, "learning_rate": 4.7955402672006855e-06, "loss": 0.7323, "step": 14 }, { "epoch": 7.0, "eval_loss": 0.7645982503890991, "eval_runtime": 7.482, "eval_samples_per_second": 4.678, "eval_steps_per_second": 0.668, "step": 14 }, { "epoch": 7.5, "grad_norm": 1.8697445392608643, "learning_rate": 4.734081600808531e-06, "loss": 0.7175, "step": 15 }, { "epoch": 8.0, "grad_norm": 1.4500072002410889, "learning_rate": 4.665063509461098e-06, "loss": 0.6814, "step": 16 }, { "epoch": 8.0, "eval_loss": 0.7229499816894531, "eval_runtime": 7.4866, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.668, "step": 16 }, { "epoch": 8.5, "grad_norm": 1.2065025568008423, "learning_rate": 4.588719528532342e-06, "loss": 0.6536, "step": 17 }, { "epoch": 9.0, "grad_norm": 1.2752524614334106, "learning_rate": 4.50530798188761e-06, "loss": 0.6211, "step": 18 }, { "epoch": 9.0, "eval_loss": 0.6847367882728577, "eval_runtime": 7.4811, "eval_samples_per_second": 4.678, "eval_steps_per_second": 0.668, "step": 18 }, { "epoch": 9.5, "grad_norm": 1.4255502223968506, "learning_rate": 4.415111107797445e-06, "loss": 0.5956, "step": 19 }, { "epoch": 10.0, "grad_norm": 1.2353219985961914, "learning_rate": 4.318434103932622e-06, "loss": 0.5738, "step": 20 }, { "epoch": 10.0, "eval_loss": 0.6638691425323486, "eval_runtime": 7.4794, "eval_samples_per_second": 4.68, "eval_steps_per_second": 0.669, "step": 20 }, { "epoch": 10.5, "grad_norm": 1.1523832082748413, "learning_rate": 4.215604094671835e-06, "loss": 0.5499, "step": 21 }, { "epoch": 11.0, "grad_norm": 1.1074179410934448, "learning_rate": 4.106969024216348e-06, "loss": 0.5171, "step": 22 }, { "epoch": 11.0, "eval_loss": 0.6498724222183228, "eval_runtime": 7.4888, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.668, "step": 22 }, { "epoch": 11.5, "grad_norm": 1.0679000616073608, "learning_rate": 3.992896479256966e-06, "loss": 0.5088, "step": 23 }, { "epoch": 12.0, "grad_norm": 1.0380491018295288, "learning_rate": 3.8737724451770155e-06, "loss": 0.4868, "step": 24 }, { "epoch": 12.0, "eval_loss": 0.6385172605514526, "eval_runtime": 7.495, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.667, "step": 24 }, { "epoch": 12.5, "grad_norm": 0.8720147609710693, "learning_rate": 3.7500000000000005e-06, "loss": 0.4697, "step": 25 }, { "epoch": 13.0, "grad_norm": 0.9460955858230591, "learning_rate": 3.621997950501156e-06, "loss": 0.4371, "step": 26 }, { "epoch": 13.0, "eval_loss": 0.6327239871025085, "eval_runtime": 7.4987, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.667, "step": 26 }, { "epoch": 13.5, "grad_norm": 0.9779807925224304, "learning_rate": 3.4901994150978926e-06, "loss": 0.4361, "step": 27 }, { "epoch": 14.0, "grad_norm": 0.9678999185562134, "learning_rate": 3.3550503583141726e-06, "loss": 0.407, "step": 28 }, { "epoch": 14.0, "eval_loss": 0.6305895447731018, "eval_runtime": 7.492, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.667, "step": 28 }, { "epoch": 14.5, "grad_norm": 0.9876528978347778, "learning_rate": 3.217008081777726e-06, "loss": 0.4015, "step": 29 }, { "epoch": 15.0, "grad_norm": 0.9497820734977722, "learning_rate": 3.0765396768561005e-06, "loss": 0.3924, "step": 30 }, { "epoch": 15.0, "eval_loss": 0.6329755187034607, "eval_runtime": 7.4793, "eval_samples_per_second": 4.68, "eval_steps_per_second": 0.669, "step": 30 }, { "epoch": 15.5, "grad_norm": 0.9186223745346069, "learning_rate": 2.9341204441673267e-06, "loss": 0.378, "step": 31 }, { "epoch": 16.0, "grad_norm": 0.9468213319778442, "learning_rate": 2.7902322853130758e-06, "loss": 0.3505, "step": 32 }, { "epoch": 16.0, "eval_loss": 0.6392822861671448, "eval_runtime": 7.4979, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.667, "step": 32 }, { "epoch": 16.5, "grad_norm": 0.9568607807159424, "learning_rate": 2.6453620722761897e-06, "loss": 0.3475, "step": 33 }, { "epoch": 17.0, "grad_norm": 1.3522801399230957, "learning_rate": 2.5e-06, "loss": 0.3339, "step": 34 }, { "epoch": 17.0, "eval_loss": 0.6492648124694824, "eval_runtime": 7.479, "eval_samples_per_second": 4.68, "eval_steps_per_second": 0.669, "step": 34 }, { "epoch": 17.5, "grad_norm": 0.9427582025527954, "learning_rate": 2.3546379277238107e-06, "loss": 0.3202, "step": 35 }, { "epoch": 18.0, "grad_norm": 1.018237829208374, "learning_rate": 2.2097677146869242e-06, "loss": 0.3086, "step": 36 }, { "epoch": 18.0, "eval_loss": 0.6622989773750305, "eval_runtime": 7.4939, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.667, "step": 36 }, { "epoch": 18.5, "grad_norm": 0.9453594088554382, "learning_rate": 2.0658795558326745e-06, "loss": 0.3004, "step": 37 }, { "epoch": 19.0, "grad_norm": 1.172818899154663, "learning_rate": 1.9234603231439e-06, "loss": 0.2803, "step": 38 }, { "epoch": 19.0, "eval_loss": 0.6748006343841553, "eval_runtime": 7.4995, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.667, "step": 38 }, { "epoch": 19.5, "grad_norm": 1.2339236736297607, "learning_rate": 1.7829919182222752e-06, "loss": 0.2751, "step": 39 }, { "epoch": 20.0, "grad_norm": 1.0225498676300049, "learning_rate": 1.6449496416858285e-06, "loss": 0.2687, "step": 40 }, { "epoch": 20.0, "eval_loss": 0.6873091459274292, "eval_runtime": 7.4881, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.668, "step": 40 }, { "epoch": 20.5, "grad_norm": 0.9979678392410278, "learning_rate": 1.509800584902108e-06, "loss": 0.2556, "step": 41 }, { "epoch": 21.0, "grad_norm": 0.9665634632110596, "learning_rate": 1.3780020494988447e-06, "loss": 0.25, "step": 42 }, { "epoch": 21.0, "eval_loss": 0.6983169913291931, "eval_runtime": 7.4891, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.668, "step": 42 }, { "epoch": 21.5, "grad_norm": 1.0296282768249512, "learning_rate": 1.2500000000000007e-06, "loss": 0.238, "step": 43 }, { "epoch": 22.0, "grad_norm": 1.0459901094436646, "learning_rate": 1.1262275548229852e-06, "loss": 0.2306, "step": 44 }, { "epoch": 22.0, "eval_loss": 0.7099719047546387, "eval_runtime": 7.5003, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.667, "step": 44 }, { "epoch": 22.5, "grad_norm": 1.0921026468276978, "learning_rate": 1.0071035207430352e-06, "loss": 0.227, "step": 45 }, { "epoch": 23.0, "grad_norm": 0.9770745635032654, "learning_rate": 8.930309757836517e-07, "loss": 0.2168, "step": 46 }, { "epoch": 23.0, "eval_loss": 0.720533549785614, "eval_runtime": 7.4879, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.668, "step": 46 }, { "epoch": 23.5, "grad_norm": 1.0788757801055908, "learning_rate": 7.843959053281663e-07, "loss": 0.2105, "step": 47 }, { "epoch": 24.0, "grad_norm": 1.103110432624817, "learning_rate": 6.815658960673782e-07, "loss": 0.2125, "step": 48 }, { "epoch": 24.0, "eval_loss": 0.7300633192062378, "eval_runtime": 7.479, "eval_samples_per_second": 4.68, "eval_steps_per_second": 0.669, "step": 48 }, { "epoch": 24.5, "grad_norm": 0.9235285520553589, "learning_rate": 5.848888922025553e-07, "loss": 0.2052, "step": 49 }, { "epoch": 25.0, "grad_norm": 1.4067970514297485, "learning_rate": 4.946920181123904e-07, "loss": 0.2031, "step": 50 }, { "epoch": 25.0, "eval_loss": 0.7378360629081726, "eval_runtime": 7.5148, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.665, "step": 50 }, { "epoch": 25.5, "grad_norm": 0.960370659828186, "learning_rate": 4.1128047146765936e-07, "loss": 0.2048, "step": 51 }, { "epoch": 26.0, "grad_norm": 1.2136261463165283, "learning_rate": 3.3493649053890325e-07, "loss": 0.1975, "step": 52 }, { "epoch": 26.0, "eval_loss": 0.7432867288589478, "eval_runtime": 7.4969, "eval_samples_per_second": 4.669, "eval_steps_per_second": 0.667, "step": 52 }, { "epoch": 26.5, "grad_norm": 1.0613588094711304, "learning_rate": 2.6591839919146963e-07, "loss": 0.1925, "step": 53 }, { "epoch": 27.0, "grad_norm": 1.117826223373413, "learning_rate": 2.044597327993153e-07, "loss": 0.2001, "step": 54 }, { "epoch": 27.0, "eval_loss": 0.7474139928817749, "eval_runtime": 7.4824, "eval_samples_per_second": 4.678, "eval_steps_per_second": 0.668, "step": 54 }, { "epoch": 27.5, "grad_norm": 1.350067138671875, "learning_rate": 1.507684480352292e-07, "loss": 0.1942, "step": 55 }, { "epoch": 28.0, "grad_norm": 0.8971886038780212, "learning_rate": 1.0502621921127776e-07, "loss": 0.1953, "step": 56 }, { "epoch": 28.0, "eval_loss": 0.7486553192138672, "eval_runtime": 7.4958, "eval_samples_per_second": 4.669, "eval_steps_per_second": 0.667, "step": 56 }, { "epoch": 28.5, "grad_norm": 0.9421606063842773, "learning_rate": 6.738782355044048e-08, "loss": 0.1883, "step": 57 }, { "epoch": 29.0, "grad_norm": 1.1213371753692627, "learning_rate": 3.798061746947995e-08, "loss": 0.1895, "step": 58 }, { "epoch": 29.0, "eval_loss": 0.7486764788627625, "eval_runtime": 7.4875, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.668, "step": 58 }, { "epoch": 29.5, "grad_norm": 1.0604745149612427, "learning_rate": 1.6904105645142443e-08, "loss": 0.1886, "step": 59 }, { "epoch": 30.0, "grad_norm": 1.0217480659484863, "learning_rate": 4.229604321829561e-09, "loss": 0.1976, "step": 60 }, { "epoch": 30.0, "eval_loss": 0.7485920190811157, "eval_runtime": 7.2053, "eval_samples_per_second": 4.858, "eval_steps_per_second": 0.694, "step": 60 }, { "epoch": 30.0, "step": 60, "total_flos": 1.0166485230865613e+18, "train_loss": 0.5314400238295396, "train_runtime": 1496.8474, "train_samples_per_second": 1.022, "train_steps_per_second": 0.04 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 2000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0166485230865613e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }