{ "best_global_step": 334, "best_metric": 1.941367506980896, "best_model_checkpoint": "./Summary-0.1/checkpoint-334", "epoch": 3.0, "eval_steps": 500, "global_step": 501, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059880239520958084, "grad_norm": 38.75, "learning_rate": 4.5e-06, "loss": 2.7202, "mean_token_accuracy": 0.45009988248348237, "num_tokens": 29469.0, "step": 10 }, { "epoch": 0.11976047904191617, "grad_norm": 9.25, "learning_rate": 9.5e-06, "loss": 2.2626, "mean_token_accuracy": 0.5219378590583801, "num_tokens": 58579.0, "step": 20 }, { "epoch": 0.17964071856287425, "grad_norm": 6.375, "learning_rate": 1.45e-05, "loss": 2.2181, "mean_token_accuracy": 0.518680065870285, "num_tokens": 88193.0, "step": 30 }, { "epoch": 0.23952095808383234, "grad_norm": 5.9375, "learning_rate": 1.9500000000000003e-05, "loss": 2.0667, "mean_token_accuracy": 0.5430938720703125, "num_tokens": 116933.0, "step": 40 }, { "epoch": 0.2994011976047904, "grad_norm": 5.65625, "learning_rate": 2.45e-05, "loss": 2.1906, "mean_token_accuracy": 0.5175440430641174, "num_tokens": 146513.0, "step": 50 }, { "epoch": 0.3592814371257485, "grad_norm": 5.9375, "learning_rate": 2.95e-05, "loss": 2.092, "mean_token_accuracy": 0.5363078862428665, "num_tokens": 175698.0, "step": 60 }, { "epoch": 0.41916167664670656, "grad_norm": 4.9375, "learning_rate": 3.45e-05, "loss": 1.9358, "mean_token_accuracy": 0.568104338645935, "num_tokens": 202607.0, "step": 70 }, { "epoch": 0.47904191616766467, "grad_norm": 5.1875, "learning_rate": 3.9500000000000005e-05, "loss": 2.0201, "mean_token_accuracy": 0.550568813085556, "num_tokens": 231199.0, "step": 80 }, { "epoch": 0.5389221556886228, "grad_norm": 5.03125, "learning_rate": 4.4500000000000004e-05, "loss": 1.9076, "mean_token_accuracy": 0.5696590662002563, "num_tokens": 260064.0, "step": 90 }, { "epoch": 0.5988023952095808, "grad_norm": 5.03125, "learning_rate": 4.9500000000000004e-05, "loss": 1.9292, "mean_token_accuracy": 0.5686947405338287, "num_tokens": 288635.0, "step": 100 }, { "epoch": 0.6586826347305389, "grad_norm": 5.40625, "learning_rate": 4.8076923076923084e-05, "loss": 2.0754, "mean_token_accuracy": 0.5396111845970154, "num_tokens": 315994.0, "step": 110 }, { "epoch": 0.718562874251497, "grad_norm": 4.78125, "learning_rate": 4.594017094017094e-05, "loss": 1.942, "mean_token_accuracy": 0.567721825838089, "num_tokens": 345852.0, "step": 120 }, { "epoch": 0.7784431137724551, "grad_norm": 4.84375, "learning_rate": 4.3803418803418805e-05, "loss": 2.0083, "mean_token_accuracy": 0.5502024054527282, "num_tokens": 375046.0, "step": 130 }, { "epoch": 0.8383233532934131, "grad_norm": 4.59375, "learning_rate": 4.166666666666667e-05, "loss": 1.9348, "mean_token_accuracy": 0.563525739312172, "num_tokens": 403950.0, "step": 140 }, { "epoch": 0.8982035928143712, "grad_norm": 4.90625, "learning_rate": 3.952991452991453e-05, "loss": 1.8796, "mean_token_accuracy": 0.5778123795986175, "num_tokens": 433871.0, "step": 150 }, { "epoch": 0.9580838323353293, "grad_norm": 4.375, "learning_rate": 3.739316239316239e-05, "loss": 2.0021, "mean_token_accuracy": 0.5490101099014282, "num_tokens": 463543.0, "step": 160 }, { "epoch": 1.0, "eval_loss": 1.9482988119125366, "eval_mean_token_accuracy": 0.5604007748457102, "eval_num_tokens": 482617.0, "eval_runtime": 39.6692, "eval_samples_per_second": 2.521, "eval_steps_per_second": 0.328, "step": 167 }, { "epoch": 1.0179640718562875, "grad_norm": 4.53125, "learning_rate": 3.525641025641026e-05, "loss": 1.8939, "mean_token_accuracy": 0.5751068115234375, "num_tokens": 491782.0, "step": 170 }, { "epoch": 1.0778443113772456, "grad_norm": 4.25, "learning_rate": 3.311965811965812e-05, "loss": 1.572, "mean_token_accuracy": 0.6416267931461335, "num_tokens": 521599.0, "step": 180 }, { "epoch": 1.1377245508982037, "grad_norm": 4.71875, "learning_rate": 3.098290598290599e-05, "loss": 1.5429, "mean_token_accuracy": 0.6442601144313812, "num_tokens": 550968.0, "step": 190 }, { "epoch": 1.1976047904191618, "grad_norm": 4.03125, "learning_rate": 2.8846153846153845e-05, "loss": 1.4855, "mean_token_accuracy": 0.6554409444332123, "num_tokens": 579850.0, "step": 200 }, { "epoch": 1.2574850299401197, "grad_norm": 4.53125, "learning_rate": 2.670940170940171e-05, "loss": 1.5925, "mean_token_accuracy": 0.6354905068874359, "num_tokens": 607566.0, "step": 210 }, { "epoch": 1.3173652694610778, "grad_norm": 4.34375, "learning_rate": 2.4572649572649573e-05, "loss": 1.6961, "mean_token_accuracy": 0.611818504333496, "num_tokens": 636366.0, "step": 220 }, { "epoch": 1.377245508982036, "grad_norm": 4.28125, "learning_rate": 2.2435897435897437e-05, "loss": 1.6871, "mean_token_accuracy": 0.6123433768749237, "num_tokens": 665501.0, "step": 230 }, { "epoch": 1.437125748502994, "grad_norm": 5.34375, "learning_rate": 2.02991452991453e-05, "loss": 1.6756, "mean_token_accuracy": 0.6171969532966614, "num_tokens": 692397.0, "step": 240 }, { "epoch": 1.4970059880239521, "grad_norm": 4.40625, "learning_rate": 1.8162393162393162e-05, "loss": 1.6237, "mean_token_accuracy": 0.6245103716850281, "num_tokens": 720330.0, "step": 250 }, { "epoch": 1.55688622754491, "grad_norm": 3.9375, "learning_rate": 1.602564102564103e-05, "loss": 1.6596, "mean_token_accuracy": 0.6197108209133149, "num_tokens": 747976.0, "step": 260 }, { "epoch": 1.6167664670658684, "grad_norm": 4.3125, "learning_rate": 1.388888888888889e-05, "loss": 1.6428, "mean_token_accuracy": 0.6236989557743072, "num_tokens": 776610.0, "step": 270 }, { "epoch": 1.6766467065868262, "grad_norm": 4.125, "learning_rate": 1.1752136752136752e-05, "loss": 1.6659, "mean_token_accuracy": 0.6144291937351227, "num_tokens": 806986.0, "step": 280 }, { "epoch": 1.7365269461077846, "grad_norm": 4.5625, "learning_rate": 9.615384615384616e-06, "loss": 1.6745, "mean_token_accuracy": 0.6144052445888519, "num_tokens": 835571.0, "step": 290 }, { "epoch": 1.7964071856287425, "grad_norm": 4.625, "learning_rate": 7.478632478632479e-06, "loss": 1.6576, "mean_token_accuracy": 0.6180627286434174, "num_tokens": 865294.0, "step": 300 }, { "epoch": 1.8562874251497006, "grad_norm": 4.25, "learning_rate": 5.341880341880342e-06, "loss": 1.6627, "mean_token_accuracy": 0.6169491648674011, "num_tokens": 894249.0, "step": 310 }, { "epoch": 1.9161676646706587, "grad_norm": 4.96875, "learning_rate": 3.205128205128205e-06, "loss": 1.5248, "mean_token_accuracy": 0.6464354753494262, "num_tokens": 924046.0, "step": 320 }, { "epoch": 1.9760479041916168, "grad_norm": 4.09375, "learning_rate": 1.0683760683760685e-06, "loss": 1.6545, "mean_token_accuracy": 0.6218094885349273, "num_tokens": 954354.0, "step": 330 }, { "epoch": 2.0, "eval_loss": 1.941367506980896, "eval_mean_token_accuracy": 0.564078491467696, "eval_num_tokens": 965234.0, "eval_runtime": 39.4675, "eval_samples_per_second": 2.534, "eval_steps_per_second": 0.329, "step": 334 }, { "epoch": 2.035928143712575, "grad_norm": 4.375, "learning_rate": 2.0199501246882794e-05, "loss": 1.3888, "mean_token_accuracy": 0.6779806514581045, "num_tokens": 16794.0, "step": 340 }, { "epoch": 2.095808383233533, "grad_norm": 4.625, "learning_rate": 1.8952618453865337e-05, "loss": 1.5284, "mean_token_accuracy": 0.6481667637825013, "num_tokens": 44874.0, "step": 350 }, { "epoch": 2.155688622754491, "grad_norm": 5.90625, "learning_rate": 1.770573566084788e-05, "loss": 1.5412, "mean_token_accuracy": 0.6448413729667664, "num_tokens": 71727.0, "step": 360 }, { "epoch": 2.215568862275449, "grad_norm": 4.3125, "learning_rate": 1.6458852867830423e-05, "loss": 1.5195, "mean_token_accuracy": 0.6477404713630677, "num_tokens": 101889.0, "step": 370 }, { "epoch": 2.2754491017964074, "grad_norm": 4.0, "learning_rate": 1.5211970074812968e-05, "loss": 1.4846, "mean_token_accuracy": 0.6572466909885406, "num_tokens": 132140.0, "step": 380 }, { "epoch": 2.3353293413173652, "grad_norm": 4.3125, "learning_rate": 1.396508728179551e-05, "loss": 1.604, "mean_token_accuracy": 0.6328544735908508, "num_tokens": 161063.0, "step": 390 }, { "epoch": 2.3952095808383236, "grad_norm": 4.71875, "learning_rate": 1.2718204488778054e-05, "loss": 1.5462, "mean_token_accuracy": 0.6424768209457398, "num_tokens": 189798.0, "step": 400 }, { "epoch": 2.4550898203592815, "grad_norm": 4.34375, "learning_rate": 1.1471321695760599e-05, "loss": 1.5468, "mean_token_accuracy": 0.6386782228946686, "num_tokens": 219194.0, "step": 410 }, { "epoch": 2.5149700598802394, "grad_norm": 4.65625, "learning_rate": 1.0224438902743143e-05, "loss": 1.5713, "mean_token_accuracy": 0.6371028661727905, "num_tokens": 249445.0, "step": 420 }, { "epoch": 2.5748502994011977, "grad_norm": 3.875, "learning_rate": 8.977556109725686e-06, "loss": 1.4073, "mean_token_accuracy": 0.6741897523403168, "num_tokens": 277096.0, "step": 430 }, { "epoch": 2.6347305389221556, "grad_norm": 5.15625, "learning_rate": 7.73067331670823e-06, "loss": 1.5873, "mean_token_accuracy": 0.6345809698104858, "num_tokens": 306656.0, "step": 440 }, { "epoch": 2.694610778443114, "grad_norm": 4.125, "learning_rate": 6.483790523690773e-06, "loss": 1.5398, "mean_token_accuracy": 0.6446067214012146, "num_tokens": 334989.0, "step": 450 }, { "epoch": 2.754491017964072, "grad_norm": 4.78125, "learning_rate": 5.236907730673317e-06, "loss": 1.4324, "mean_token_accuracy": 0.6648351371288299, "num_tokens": 363393.0, "step": 460 }, { "epoch": 2.81437125748503, "grad_norm": 4.375, "learning_rate": 3.99002493765586e-06, "loss": 1.4939, "mean_token_accuracy": 0.6552098572254181, "num_tokens": 392445.0, "step": 470 }, { "epoch": 2.874251497005988, "grad_norm": 4.625, "learning_rate": 2.743142144638404e-06, "loss": 1.5451, "mean_token_accuracy": 0.6401039361953735, "num_tokens": 421549.0, "step": 480 }, { "epoch": 2.934131736526946, "grad_norm": 4.9375, "learning_rate": 1.4962593516209476e-06, "loss": 1.5018, "mean_token_accuracy": 0.6498535394668579, "num_tokens": 450516.0, "step": 490 }, { "epoch": 2.9940119760479043, "grad_norm": 4.8125, "learning_rate": 2.4937655860349126e-07, "loss": 1.502, "mean_token_accuracy": 0.651291674375534, "num_tokens": 480685.0, "step": 500 }, { "epoch": 3.0, "eval_loss": 1.9642236232757568, "eval_mean_token_accuracy": 0.5623479668910687, "eval_num_tokens": 482617.0, "eval_runtime": 39.682, "eval_samples_per_second": 2.52, "eval_steps_per_second": 0.328, "step": 501 } ], "logging_steps": 10, "max_steps": 501, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2647290262044672.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }