{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.691509962081909, "learning_rate": 0.00015998457923856519, "loss": 1.4456, "step": 1 }, { "epoch": 0.025, "grad_norm": 24.26286506652832, "learning_rate": 0.00015993832289925785, "loss": 4.3349, "step": 2 }, { "epoch": 0.0375, "grad_norm": 13.163036346435547, "learning_rate": 0.0001598612488147773, "loss": 2.6695, "step": 3 }, { "epoch": 0.05, "grad_norm": 9.818785667419434, "learning_rate": 0.00015975338669865026, "loss": 2.3799, "step": 4 }, { "epoch": 0.0625, "grad_norm": 6.200242519378662, "learning_rate": 0.00015961477813377576, "loss": 2.0935, "step": 5 }, { "epoch": 0.075, "grad_norm": 2.0556814670562744, "learning_rate": 0.00015944547655639412, "loss": 1.8465, "step": 6 }, { "epoch": 0.0875, "grad_norm": 2.5195746421813965, "learning_rate": 0.00015924554723548617, "loss": 1.7321, "step": 7 }, { "epoch": 0.1, "grad_norm": 4.300451278686523, "learning_rate": 0.00015901506724761103, "loss": 1.7284, "step": 8 }, { "epoch": 0.1125, "grad_norm": 1.5021892786026, "learning_rate": 0.00015875412544719134, "loss": 1.5971, "step": 9 }, { "epoch": 0.125, "grad_norm": 1.5246820449829102, "learning_rate": 0.00015846282243225845, "loss": 1.562, "step": 10 }, { "epoch": 0.1375, "grad_norm": 2.0095787048339844, "learning_rate": 0.0001581412705056698, "loss": 1.578, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.9773982763290405, "learning_rate": 0.00015778959363181415, "loss": 1.4977, "step": 12 }, { "epoch": 0.1625, "grad_norm": 1.1493251323699951, "learning_rate": 0.0001574079273888208, "loss": 1.5075, "step": 13 }, { "epoch": 0.175, "grad_norm": 0.8909309506416321, "learning_rate": 0.00015699641891629178, "loss": 1.4158, "step": 14 }, { "epoch": 0.1875, "grad_norm": 0.9415439963340759, "learning_rate": 0.00015655522685857672, "loss": 1.4219, "step": 15 }, { "epoch": 0.2, "grad_norm": 1.1703603267669678, "learning_rate": 0.0001560845213036123, "loss": 1.4006, "step": 16 }, { "epoch": 0.2125, "grad_norm": 0.7575011849403381, "learning_rate": 0.00015558448371735025, "loss": 1.3675, "step": 17 }, { "epoch": 0.225, "grad_norm": 0.6772542595863342, "learning_rate": 0.00015505530687379875, "loss": 1.3369, "step": 18 }, { "epoch": 0.2375, "grad_norm": 0.5587411522865295, "learning_rate": 0.00015449719478070428, "loss": 1.3632, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.5920618772506714, "learning_rate": 0.00015391036260090294, "loss": 1.3511, "step": 20 }, { "epoch": 0.2625, "grad_norm": 0.4218953847885132, "learning_rate": 0.0001532950365693709, "loss": 1.3641, "step": 21 }, { "epoch": 0.275, "grad_norm": 0.4676741361618042, "learning_rate": 0.00015265145390600652, "loss": 1.3441, "step": 22 }, { "epoch": 0.2875, "grad_norm": 0.38095250725746155, "learning_rate": 0.00015197986272417774, "loss": 1.3418, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.42308753728866577, "learning_rate": 0.00015128052193506944, "loss": 1.3646, "step": 24 }, { "epoch": 0.3125, "grad_norm": 0.4307089149951935, "learning_rate": 0.0001505537011478684, "loss": 1.2992, "step": 25 }, { "epoch": 0.325, "grad_norm": 0.33103814721107483, "learning_rate": 0.0001497996805658238, "loss": 1.3435, "step": 26 }, { "epoch": 0.3375, "grad_norm": 0.3511773645877838, "learning_rate": 0.00014901875087822337, "loss": 1.3, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.2914850115776062, "learning_rate": 0.0001482112131483274, "loss": 1.3103, "step": 28 }, { "epoch": 0.3625, "grad_norm": 0.37050625681877136, "learning_rate": 0.00014737737869730292, "loss": 1.2731, "step": 29 }, { "epoch": 0.375, "grad_norm": 0.3476356565952301, "learning_rate": 0.00014651756898420365, "loss": 1.3211, "step": 30 }, { "epoch": 0.3875, "grad_norm": 0.27799472212791443, "learning_rate": 0.0001456321154820411, "loss": 1.2657, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.318327397108078, "learning_rate": 0.00014472135954999581, "loss": 1.3068, "step": 32 }, { "epoch": 0.4125, "grad_norm": 0.30465707182884216, "learning_rate": 0.00014378565230181657, "loss": 1.2839, "step": 33 }, { "epoch": 0.425, "grad_norm": 0.2618834376335144, "learning_rate": 0.0001428253544704596, "loss": 1.2868, "step": 34 }, { "epoch": 0.4375, "grad_norm": 0.2864656150341034, "learning_rate": 0.00014184083626901897, "loss": 1.2815, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.2776831388473511, "learning_rate": 0.0001408324772480025, "loss": 1.2895, "step": 36 }, { "epoch": 0.4625, "grad_norm": 0.31238630414009094, "learning_rate": 0.00013980066614900776, "loss": 1.2718, "step": 37 }, { "epoch": 0.475, "grad_norm": 0.23365426063537598, "learning_rate": 0.00013874580075485485, "loss": 1.2596, "step": 38 }, { "epoch": 0.4875, "grad_norm": 0.23924365639686584, "learning_rate": 0.00013766828773623352, "loss": 1.2809, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.24298632144927979, "learning_rate": 0.00013656854249492382, "loss": 1.2248, "step": 40 }, { "epoch": 0.5125, "grad_norm": 0.25117772817611694, "learning_rate": 0.0001354469890036509, "loss": 1.2653, "step": 41 }, { "epoch": 0.525, "grad_norm": 0.25377020239830017, "learning_rate": 0.00013430405964263536, "loss": 1.2687, "step": 42 }, { "epoch": 0.5375, "grad_norm": 0.24669994413852692, "learning_rate": 0.00013314019503290255, "loss": 1.269, "step": 43 }, { "epoch": 0.55, "grad_norm": 0.22006134688854218, "learning_rate": 0.00013195584386641469, "loss": 1.2559, "step": 44 }, { "epoch": 0.5625, "grad_norm": 0.2517986595630646, "learning_rate": 0.00013075146273309164, "loss": 1.2477, "step": 45 }, { "epoch": 0.575, "grad_norm": 0.21466796100139618, "learning_rate": 0.00012952751594478675, "loss": 1.2358, "step": 46 }, { "epoch": 0.5875, "grad_norm": 0.2188994437456131, "learning_rate": 0.0001282844753562857, "loss": 1.2444, "step": 47 }, { "epoch": 0.6, "grad_norm": 2.4198501110076904, "learning_rate": 0.00012702282018339786, "loss": 1.2535, "step": 48 }, { "epoch": 0.6125, "grad_norm": 0.3393913209438324, "learning_rate": 0.00012574303681820898, "loss": 1.2361, "step": 49 }, { "epoch": 0.625, "grad_norm": 0.32384437322616577, "learning_rate": 0.0001244456186415682, "loss": 1.2283, "step": 50 }, { "epoch": 0.6375, "grad_norm": 0.3469082713127136, "learning_rate": 0.00012313106583288004, "loss": 1.2401, "step": 51 }, { "epoch": 0.65, "grad_norm": 0.42606261372566223, "learning_rate": 0.00012179988517727591, "loss": 1.2399, "step": 52 }, { "epoch": 0.6625, "grad_norm": 0.4077642261981964, "learning_rate": 0.00012045258987023879, "loss": 1.2441, "step": 53 }, { "epoch": 0.675, "grad_norm": 0.3077225089073181, "learning_rate": 0.00011908969931975641, "loss": 1.253, "step": 54 }, { "epoch": 0.6875, "grad_norm": 0.9925752878189087, "learning_rate": 0.00011771173894607985, "loss": 1.2586, "step": 55 }, { "epoch": 0.7, "grad_norm": 1.9072725772857666, "learning_rate": 0.00011631923997916375, "loss": 1.2643, "step": 56 }, { "epoch": 0.7125, "grad_norm": 0.5788567662239075, "learning_rate": 0.00011491273925386736, "loss": 1.2657, "step": 57 }, { "epoch": 0.725, "grad_norm": 0.9417564868927002, "learning_rate": 0.00011349277900299426, "loss": 1.2526, "step": 58 }, { "epoch": 0.7375, "grad_norm": 0.9247767329216003, "learning_rate": 0.00011205990664825127, "loss": 1.2402, "step": 59 }, { "epoch": 0.75, "grad_norm": 0.5092797875404358, "learning_rate": 0.00011061467458920719, "loss": 1.2264, "step": 60 }, { "epoch": 0.7625, "grad_norm": 0.8128093481063843, "learning_rate": 0.00010915763999033201, "loss": 1.22, "step": 61 }, { "epoch": 0.775, "grad_norm": 0.4954143166542053, "learning_rate": 0.00010768936456619945, "loss": 1.203, "step": 62 }, { "epoch": 0.7875, "grad_norm": 0.7117099761962891, "learning_rate": 0.0001062104143649355, "loss": 1.2295, "step": 63 }, { "epoch": 0.8, "grad_norm": 0.5060359835624695, "learning_rate": 0.0001047213595499958, "loss": 1.1936, "step": 64 }, { "epoch": 0.8125, "grad_norm": 0.5212268829345703, "learning_rate": 0.000103222774180357, "loss": 1.1927, "step": 65 }, { "epoch": 0.825, "grad_norm": 0.47975900769233704, "learning_rate": 0.00010171523598920594, "loss": 1.2116, "step": 66 }, { "epoch": 0.8375, "grad_norm": 0.3655720055103302, "learning_rate": 0.00010019932616121264, "loss": 1.2002, "step": 67 }, { "epoch": 0.85, "grad_norm": 0.38993576169013977, "learning_rate": 9.867562910847246e-05, "loss": 1.2225, "step": 68 }, { "epoch": 0.8625, "grad_norm": 0.33190780878067017, "learning_rate": 9.714473224520406e-05, "loss": 1.1982, "step": 69 }, { "epoch": 0.875, "grad_norm": 0.3178853988647461, "learning_rate": 9.560722576129029e-05, "loss": 1.2015, "step": 70 }, { "epoch": 0.8875, "grad_norm": 0.28483396768569946, "learning_rate": 9.406370239474839e-05, "loss": 1.2013, "step": 71 }, { "epoch": 0.9, "grad_norm": 0.26456528902053833, "learning_rate": 9.251475720321848e-05, "loss": 1.2101, "step": 72 }, { "epoch": 0.9125, "grad_norm": 0.24198457598686218, "learning_rate": 9.096098733455746e-05, "loss": 1.1889, "step": 73 }, { "epoch": 0.925, "grad_norm": 0.2521977424621582, "learning_rate": 8.940299179662703e-05, "loss": 1.1915, "step": 74 }, { "epoch": 0.9375, "grad_norm": 0.22842273116111755, "learning_rate": 8.784137122636488e-05, "loss": 1.2018, "step": 75 }, { "epoch": 0.95, "grad_norm": 0.21817852556705475, "learning_rate": 8.627672765822762e-05, "loss": 1.188, "step": 76 }, { "epoch": 0.9625, "grad_norm": 0.1990320086479187, "learning_rate": 8.470966429209512e-05, "loss": 1.1821, "step": 77 }, { "epoch": 0.975, "grad_norm": 0.20685255527496338, "learning_rate": 8.31407852607255e-05, "loss": 1.1687, "step": 78 }, { "epoch": 0.9875, "grad_norm": 0.20527754724025726, "learning_rate": 8.157069539685026e-05, "loss": 1.2024, "step": 79 }, { "epoch": 1.0, "grad_norm": 0.20712077617645264, "learning_rate": 8e-05, "loss": 1.1721, "step": 80 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6380640896784794e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }