{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 658, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1479004077613353, "epoch": 0.015197568389057751, "grad_norm": 0.51171875, "learning_rate": 1.8e-05, "loss": 1.5924851417541503, "mean_token_accuracy": 0.6634366158396006, "num_tokens": 62452.0, "step": 10 }, { "entropy": 1.2071532987058162, "epoch": 0.030395136778115502, "grad_norm": 0.361328125, "learning_rate": 3.8e-05, "loss": 1.5587355613708496, "mean_token_accuracy": 0.6596032619476319, "num_tokens": 125614.0, "step": 20 }, { "entropy": 1.1939050488173961, "epoch": 0.04559270516717325, "grad_norm": 0.40625, "learning_rate": 5.8e-05, "loss": 1.3669007301330567, "mean_token_accuracy": 0.6933483470231294, "num_tokens": 184892.0, "step": 30 }, { "entropy": 1.221787953004241, "epoch": 0.060790273556231005, "grad_norm": 0.353515625, "learning_rate": 7.800000000000001e-05, "loss": 1.23649320602417, "mean_token_accuracy": 0.7031858772039413, "num_tokens": 245953.0, "step": 40 }, { "entropy": 1.077309934794903, "epoch": 0.07598784194528875, "grad_norm": 0.28515625, "learning_rate": 9.8e-05, "loss": 1.0901852607727052, "mean_token_accuracy": 0.7312592580914498, "num_tokens": 306051.0, "step": 50 }, { "entropy": 1.1815281737595797, "epoch": 0.0911854103343465, "grad_norm": 0.2353515625, "learning_rate": 0.000118, "loss": 1.188125991821289, "mean_token_accuracy": 0.7160288453102112, "num_tokens": 366518.0, "step": 60 }, { "entropy": 1.0586148291826247, "epoch": 0.10638297872340426, "grad_norm": 0.2392578125, "learning_rate": 0.000138, "loss": 1.0688477516174317, "mean_token_accuracy": 0.7354599207639694, "num_tokens": 430087.0, "step": 70 }, { "entropy": 1.050880615785718, "epoch": 0.12158054711246201, "grad_norm": 0.3515625, "learning_rate": 0.00015800000000000002, "loss": 1.051170825958252, "mean_token_accuracy": 0.73864571377635, "num_tokens": 491536.0, "step": 80 }, { "entropy": 1.0797226771712303, "epoch": 0.13677811550151975, "grad_norm": 0.298828125, "learning_rate": 0.00017800000000000002, "loss": 1.0682001113891602, "mean_token_accuracy": 0.7340194799005986, "num_tokens": 552928.0, "step": 90 }, { "entropy": 1.0816405173391104, "epoch": 0.1519756838905775, "grad_norm": 0.2333984375, "learning_rate": 0.00019800000000000002, "loss": 1.0748048782348634, "mean_token_accuracy": 0.7275203809142112, "num_tokens": 617585.0, "step": 100 }, { "entropy": 1.1387810289859772, "epoch": 0.16717325227963525, "grad_norm": 0.2421875, "learning_rate": 0.00019987165071710527, "loss": 1.1564626693725586, "mean_token_accuracy": 0.7178827051073313, "num_tokens": 682493.0, "step": 110 }, { "entropy": 1.0470557224005461, "epoch": 0.182370820668693, "grad_norm": 0.263671875, "learning_rate": 0.00019942839715782445, "loss": 1.0679837226867677, "mean_token_accuracy": 0.7393594264984131, "num_tokens": 741585.0, "step": 120 }, { "entropy": 1.022692532464862, "epoch": 0.19756838905775076, "grad_norm": 0.1943359375, "learning_rate": 0.0001986700590805916, "loss": 1.022587490081787, "mean_token_accuracy": 0.7484504476189613, "num_tokens": 805276.0, "step": 130 }, { "entropy": 1.0931762170046568, "epoch": 0.2127659574468085, "grad_norm": 0.2890625, "learning_rate": 0.00019759903962771156, "loss": 1.0889897346496582, "mean_token_accuracy": 0.7326500549912452, "num_tokens": 870605.0, "step": 140 }, { "entropy": 1.046710380911827, "epoch": 0.22796352583586627, "grad_norm": 0.283203125, "learning_rate": 0.00019621873281596092, "loss": 1.088566493988037, "mean_token_accuracy": 0.7342231959104538, "num_tokens": 932551.0, "step": 150 }, { "entropy": 1.0759602926671505, "epoch": 0.24316109422492402, "grad_norm": 0.2265625, "learning_rate": 0.00019453351278108806, "loss": 1.0699708938598633, "mean_token_accuracy": 0.735610119625926, "num_tokens": 991401.0, "step": 160 }, { "entropy": 1.0602354250848294, "epoch": 0.25835866261398177, "grad_norm": 0.2255859375, "learning_rate": 0.00019254871991635598, "loss": 1.0612274169921876, "mean_token_accuracy": 0.7369228135794401, "num_tokens": 1053437.0, "step": 170 }, { "entropy": 1.0833635926246643, "epoch": 0.2735562310030395, "grad_norm": 0.2333984375, "learning_rate": 0.00019027064394905473, "loss": 1.0936810493469238, "mean_token_accuracy": 0.7335428461432457, "num_tokens": 1116692.0, "step": 180 }, { "entropy": 1.0710609000176192, "epoch": 0.2887537993920973, "grad_norm": 0.2470703125, "learning_rate": 0.00018770650400861357, "loss": 1.0741844177246094, "mean_token_accuracy": 0.7365527264773846, "num_tokens": 1176488.0, "step": 190 }, { "entropy": 1.0221609488129615, "epoch": 0.303951367781155, "grad_norm": 0.302734375, "learning_rate": 0.00018486442574947511, "loss": 1.0395893096923827, "mean_token_accuracy": 0.7463255256414414, "num_tokens": 1234621.0, "step": 200 }, { "entropy": 1.028166577219963, "epoch": 0.3191489361702128, "grad_norm": 0.263671875, "learning_rate": 0.0001817534156012295, "loss": 1.0307375907897949, "mean_token_accuracy": 0.7422413423657417, "num_tokens": 1293632.0, "step": 210 }, { "entropy": 1.0808881603181362, "epoch": 0.3343465045592705, "grad_norm": 0.220703125, "learning_rate": 0.00017838333222760792, "loss": 1.0678336143493652, "mean_token_accuracy": 0.7352703791111708, "num_tokens": 1355301.0, "step": 220 }, { "entropy": 1.0897805251181125, "epoch": 0.3495440729483283, "grad_norm": 0.232421875, "learning_rate": 0.00017476485528478093, "loss": 1.0985455513000488, "mean_token_accuracy": 0.7289470963180065, "num_tokens": 1416981.0, "step": 230 }, { "entropy": 1.0927846007049085, "epoch": 0.364741641337386, "grad_norm": 0.23828125, "learning_rate": 0.0001709094515779655, "loss": 1.0999566078186036, "mean_token_accuracy": 0.7316457532346249, "num_tokens": 1477934.0, "step": 240 }, { "entropy": 1.0481801606714725, "epoch": 0.3799392097264438, "grad_norm": 0.2421875, "learning_rate": 0.00016682933872358912, "loss": 1.0316462516784668, "mean_token_accuracy": 0.7426637083292007, "num_tokens": 1539241.0, "step": 250 }, { "entropy": 1.0000269718468189, "epoch": 0.3951367781155015, "grad_norm": 0.255859375, "learning_rate": 0.00016253744643216368, "loss": 1.0046791076660155, "mean_token_accuracy": 0.747460788488388, "num_tokens": 1604048.0, "step": 260 }, { "entropy": 1.0748730458319187, "epoch": 0.41033434650455924, "grad_norm": 0.236328125, "learning_rate": 0.0001580473755345625, "loss": 1.0607515335083009, "mean_token_accuracy": 0.7351726233959198, "num_tokens": 1666904.0, "step": 270 }, { "entropy": 1.059519186615944, "epoch": 0.425531914893617, "grad_norm": 0.224609375, "learning_rate": 0.00015337335488154431, "loss": 1.0550406455993653, "mean_token_accuracy": 0.7438629917800427, "num_tokens": 1725233.0, "step": 280 }, { "entropy": 1.049350445717573, "epoch": 0.44072948328267475, "grad_norm": 0.203125, "learning_rate": 0.00014853019625310813, "loss": 1.0740507125854493, "mean_token_accuracy": 0.7419425651431084, "num_tokens": 1788516.0, "step": 290 }, { "entropy": 1.0794111423194408, "epoch": 0.45592705167173253, "grad_norm": 0.2734375, "learning_rate": 0.000143533247420569, "loss": 1.0830384254455567, "mean_token_accuracy": 0.7354968316853047, "num_tokens": 1849950.0, "step": 300 }, { "entropy": 1.0809252394363285, "epoch": 0.47112462006079026, "grad_norm": 0.271484375, "learning_rate": 0.00013839834351009954, "loss": 1.106314754486084, "mean_token_accuracy": 0.7333131659775972, "num_tokens": 1911826.0, "step": 310 }, { "entropy": 1.0794141918420792, "epoch": 0.48632218844984804, "grad_norm": 0.251953125, "learning_rate": 0.0001331417568218636, "loss": 1.080414390563965, "mean_token_accuracy": 0.7330277658998966, "num_tokens": 1971255.0, "step": 320 }, { "entropy": 1.0166626147925855, "epoch": 0.5015197568389058, "grad_norm": 0.2412109375, "learning_rate": 0.00012778014526376353, "loss": 1.0223579406738281, "mean_token_accuracy": 0.7451204493641853, "num_tokens": 2035666.0, "step": 330 }, { "entropy": 1.0680379424244166, "epoch": 0.5167173252279635, "grad_norm": 0.21875, "learning_rate": 0.0001223304995632124, "loss": 1.063144588470459, "mean_token_accuracy": 0.7386665888130665, "num_tokens": 2097745.0, "step": 340 }, { "entropy": 1.1010748460888862, "epoch": 0.5319148936170213, "grad_norm": 0.265625, "learning_rate": 0.00011681008942421483, "loss": 1.1296490669250487, "mean_token_accuracy": 0.7327280201017856, "num_tokens": 2154975.0, "step": 350 }, { "entropy": 1.0673069586977362, "epoch": 0.547112462006079, "grad_norm": 0.2392578125, "learning_rate": 0.00011123640880038233, "loss": 1.0613948822021484, "mean_token_accuracy": 0.7358887560665608, "num_tokens": 2218174.0, "step": 360 }, { "entropy": 1.120655293017626, "epoch": 0.5623100303951368, "grad_norm": 0.2255859375, "learning_rate": 0.00010562712045731084, "loss": 1.1228485107421875, "mean_token_accuracy": 0.7270086117088794, "num_tokens": 2282763.0, "step": 370 }, { "entropy": 1.023940760269761, "epoch": 0.5775075987841946, "grad_norm": 0.2197265625, "learning_rate": 0.0001, "loss": 1.01791353225708, "mean_token_accuracy": 0.7478193882852793, "num_tokens": 2342406.0, "step": 380 }, { "entropy": 1.0468089915812016, "epoch": 0.5927051671732523, "grad_norm": 0.296875, "learning_rate": 9.43728795426892e-05, "loss": 1.049548053741455, "mean_token_accuracy": 0.742385634034872, "num_tokens": 2403644.0, "step": 390 }, { "entropy": 1.0749794770032168, "epoch": 0.60790273556231, "grad_norm": 0.224609375, "learning_rate": 8.87635911996177e-05, "loss": 1.0873553276062011, "mean_token_accuracy": 0.7330629404634237, "num_tokens": 2466781.0, "step": 400 }, { "entropy": 1.0800842259079217, "epoch": 0.6231003039513677, "grad_norm": 0.2734375, "learning_rate": 8.31899105757852e-05, "loss": 1.0950210571289063, "mean_token_accuracy": 0.7357834167778492, "num_tokens": 2529775.0, "step": 410 }, { "entropy": 1.0835391595959663, "epoch": 0.6382978723404256, "grad_norm": 0.2734375, "learning_rate": 7.766950043678764e-05, "loss": 1.0691117286682128, "mean_token_accuracy": 0.7371876426041126, "num_tokens": 2593744.0, "step": 420 }, { "entropy": 1.0298739653080702, "epoch": 0.6534954407294833, "grad_norm": 0.240234375, "learning_rate": 7.221985473623654e-05, "loss": 1.0413244247436524, "mean_token_accuracy": 0.7439139492809772, "num_tokens": 2656829.0, "step": 430 }, { "entropy": 1.1207933265715837, "epoch": 0.668693009118541, "grad_norm": 0.244140625, "learning_rate": 6.685824317813643e-05, "loss": 1.136990737915039, "mean_token_accuracy": 0.7278301935642958, "num_tokens": 2719001.0, "step": 440 }, { "entropy": 1.0710795857012272, "epoch": 0.6838905775075987, "grad_norm": 0.263671875, "learning_rate": 6.160165648990048e-05, "loss": 1.0599514961242675, "mean_token_accuracy": 0.7398943588137626, "num_tokens": 2783541.0, "step": 450 }, { "entropy": 1.0919535238295794, "epoch": 0.6990881458966566, "grad_norm": 0.609375, "learning_rate": 5.6466752579431016e-05, "loss": 1.0944193840026855, "mean_token_accuracy": 0.7346091568470001, "num_tokens": 2845061.0, "step": 460 }, { "entropy": 1.0125693645328284, "epoch": 0.7142857142857143, "grad_norm": 0.2470703125, "learning_rate": 5.146980374689192e-05, "loss": 1.0110005378723144, "mean_token_accuracy": 0.743566332012415, "num_tokens": 2905588.0, "step": 470 }, { "entropy": 1.0804476391524076, "epoch": 0.729483282674772, "grad_norm": 0.224609375, "learning_rate": 4.662664511845568e-05, "loss": 1.065206813812256, "mean_token_accuracy": 0.7375747956335544, "num_tokens": 2969406.0, "step": 480 }, { "entropy": 1.076500639691949, "epoch": 0.7446808510638298, "grad_norm": 0.2109375, "learning_rate": 4.195262446543753e-05, "loss": 1.0726984977722167, "mean_token_accuracy": 0.7347712397575379, "num_tokens": 3033325.0, "step": 490 }, { "entropy": 1.0519217938184737, "epoch": 0.7598784194528876, "grad_norm": 0.2373046875, "learning_rate": 3.746255356783632e-05, "loss": 1.0575304985046388, "mean_token_accuracy": 0.7421793609857559, "num_tokens": 3096358.0, "step": 500 }, { "entropy": 1.1361971575766803, "epoch": 0.7750759878419453, "grad_norm": 0.26953125, "learning_rate": 3.317066127641091e-05, "loss": 1.14003267288208, "mean_token_accuracy": 0.7234445497393608, "num_tokens": 3157442.0, "step": 510 }, { "entropy": 1.0398458503186703, "epoch": 0.790273556231003, "grad_norm": 0.263671875, "learning_rate": 2.9090548422034525e-05, "loss": 1.0192261695861817, "mean_token_accuracy": 0.7409338817000389, "num_tokens": 3217105.0, "step": 520 }, { "entropy": 1.0723623022437097, "epoch": 0.8054711246200608, "grad_norm": 0.208984375, "learning_rate": 2.523514471521913e-05, "loss": 1.0806448936462403, "mean_token_accuracy": 0.739747503399849, "num_tokens": 3279648.0, "step": 530 }, { "entropy": 1.0766091130673885, "epoch": 0.8206686930091185, "grad_norm": 0.2578125, "learning_rate": 2.1616667772392074e-05, "loss": 1.0830445289611816, "mean_token_accuracy": 0.7354145631194114, "num_tokens": 3340664.0, "step": 540 }, { "entropy": 1.0176336735486984, "epoch": 0.8358662613981763, "grad_norm": 0.2294921875, "learning_rate": 1.8246584398770493e-05, "loss": 1.0089756965637207, "mean_token_accuracy": 0.7476604901254177, "num_tokens": 3403315.0, "step": 550 }, { "entropy": 1.1855169147253037, "epoch": 0.851063829787234, "grad_norm": 0.251953125, "learning_rate": 1.5135574250524897e-05, "loss": 1.2196799278259278, "mean_token_accuracy": 0.711599162966013, "num_tokens": 3465413.0, "step": 560 }, { "entropy": 1.0133992433547974, "epoch": 0.8662613981762918, "grad_norm": 0.22265625, "learning_rate": 1.229349599138645e-05, "loss": 1.0138180732727051, "mean_token_accuracy": 0.7445756837725639, "num_tokens": 3527634.0, "step": 570 }, { "entropy": 1.0065228387713432, "epoch": 0.8814589665653495, "grad_norm": 0.2470703125, "learning_rate": 9.729356050945271e-06, "loss": 1.0059442520141602, "mean_token_accuracy": 0.7528380408883095, "num_tokens": 3589816.0, "step": 580 }, { "entropy": 1.026823963969946, "epoch": 0.8966565349544073, "grad_norm": 0.2265625, "learning_rate": 7.4512800836440525e-06, "loss": 1.0102774620056152, "mean_token_accuracy": 0.7461944825947284, "num_tokens": 3656572.0, "step": 590 }, { "entropy": 1.041152635589242, "epoch": 0.9118541033434651, "grad_norm": 0.2333984375, "learning_rate": 5.466487218911942e-06, "loss": 1.0332704544067384, "mean_token_accuracy": 0.7453692108392715, "num_tokens": 3720528.0, "step": 600 }, { "entropy": 1.0129390254616737, "epoch": 0.9270516717325228, "grad_norm": 0.259765625, "learning_rate": 3.7812671840390835e-06, "loss": 1.0002843856811523, "mean_token_accuracy": 0.7472102656960488, "num_tokens": 3784254.0, "step": 610 }, { "entropy": 1.0242063857614994, "epoch": 0.9422492401215805, "grad_norm": 0.265625, "learning_rate": 2.4009603722884742e-06, "loss": 0.9934074401855468, "mean_token_accuracy": 0.7479867108166218, "num_tokens": 3845304.0, "step": 620 }, { "entropy": 1.023862723633647, "epoch": 0.9574468085106383, "grad_norm": 0.20703125, "learning_rate": 1.3299409194084122e-06, "loss": 1.0128738403320312, "mean_token_accuracy": 0.7516457572579384, "num_tokens": 3910188.0, "step": 630 }, { "entropy": 1.0282081000506877, "epoch": 0.9726443768996961, "grad_norm": 0.2001953125, "learning_rate": 5.716028421755671e-07, "loss": 1.0188997268676758, "mean_token_accuracy": 0.7448701910674572, "num_tokens": 3973174.0, "step": 640 }, { "entropy": 1.0912907514721155, "epoch": 0.9878419452887538, "grad_norm": 0.275390625, "learning_rate": 1.2834928289472416e-07, "loss": 1.1018651962280273, "mean_token_accuracy": 0.7346004512161016, "num_tokens": 4038046.0, "step": 650 } ], "logging_steps": 10, "max_steps": 658, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.882623888257843e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }