{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 556, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4901377972215415, "epoch": 0.017992690469496767, "grad_norm": 2.2936885356903076, "learning_rate": 2.647058823529412e-06, "loss": 1.6211814880371094, "mean_token_accuracy": 0.6704532062634826, "num_tokens": 1320132.0, "step": 10 }, { "entropy": 1.276751457154751, "epoch": 0.03598538093899353, "grad_norm": 0.6068552732467651, "learning_rate": 4.9814471243042675e-06, "loss": 1.2583621978759765, "mean_token_accuracy": 0.7108760349452495, "num_tokens": 2628713.0, "step": 20 }, { "entropy": 1.1512113714590668, "epoch": 0.0539780714084903, "grad_norm": 0.0513942688703537, "learning_rate": 4.888682745825603e-06, "loss": 1.1380729675292969, "mean_token_accuracy": 0.7266728295013308, "num_tokens": 3970015.0, "step": 30 }, { "entropy": 1.1279350664466619, "epoch": 0.07197076187798707, "grad_norm": 0.054850462824106216, "learning_rate": 4.795918367346939e-06, "loss": 1.0975475311279297, "mean_token_accuracy": 0.7291999347507954, "num_tokens": 5293538.0, "step": 40 }, { "entropy": 1.115758778527379, "epoch": 0.08996345234748383, "grad_norm": 0.04872556030750275, "learning_rate": 4.7031539888682745e-06, "loss": 1.0612051010131835, "mean_token_accuracy": 0.7381280666217208, "num_tokens": 6620811.0, "step": 50 }, { "entropy": 1.1105864774435759, "epoch": 0.1079561428169806, "grad_norm": 0.04739998281002045, "learning_rate": 4.610389610389611e-06, "loss": 1.0470812797546387, "mean_token_accuracy": 0.7379071025177837, "num_tokens": 7936154.0, "step": 60 }, { "entropy": 1.0977919282391668, "epoch": 0.12594883328647738, "grad_norm": 0.040797509253025055, "learning_rate": 4.517625231910946e-06, "loss": 1.0206071853637695, "mean_token_accuracy": 0.741416247934103, "num_tokens": 9258443.0, "step": 70 }, { "entropy": 1.0766226774081589, "epoch": 0.14394152375597413, "grad_norm": 0.04117418825626373, "learning_rate": 4.424860853432282e-06, "loss": 1.0037202835083008, "mean_token_accuracy": 0.7426602357998491, "num_tokens": 10559451.0, "step": 80 }, { "entropy": 1.0392444429919123, "epoch": 0.1619342142254709, "grad_norm": 0.03727104142308235, "learning_rate": 4.332096474953618e-06, "loss": 0.9694362640380859, "mean_token_accuracy": 0.7481566898524761, "num_tokens": 11859629.0, "step": 90 }, { "entropy": 1.030888595432043, "epoch": 0.17992690469496767, "grad_norm": 0.0377194844186306, "learning_rate": 4.239332096474954e-06, "loss": 0.9774051666259765, "mean_token_accuracy": 0.7471670845523477, "num_tokens": 13170517.0, "step": 100 }, { "entropy": 0.9908933199942112, "epoch": 0.19791959516446445, "grad_norm": 0.03397062420845032, "learning_rate": 4.14656771799629e-06, "loss": 0.9399270057678223, "mean_token_accuracy": 0.7530543757602572, "num_tokens": 14480196.0, "step": 110 }, { "entropy": 0.9985105341300369, "epoch": 0.2159122856339612, "grad_norm": 0.038795359432697296, "learning_rate": 4.053803339517626e-06, "loss": 0.9471940994262695, "mean_token_accuracy": 0.7546697033569216, "num_tokens": 15807230.0, "step": 120 }, { "entropy": 0.9790718862786889, "epoch": 0.23390497610345798, "grad_norm": 0.03815858066082001, "learning_rate": 3.961038961038962e-06, "loss": 0.925960922241211, "mean_token_accuracy": 0.7591136118397117, "num_tokens": 17157655.0, "step": 130 }, { "entropy": 0.9843037761747837, "epoch": 0.25189766657295476, "grad_norm": 0.03516776114702225, "learning_rate": 3.868274582560297e-06, "loss": 0.9341155052185058, "mean_token_accuracy": 0.7569106232374907, "num_tokens": 18481580.0, "step": 140 }, { "entropy": 0.9783661976456642, "epoch": 0.2698903570424515, "grad_norm": 0.034192971885204315, "learning_rate": 3.7755102040816327e-06, "loss": 0.918891716003418, "mean_token_accuracy": 0.7582158392295242, "num_tokens": 19792039.0, "step": 150 }, { "entropy": 0.9883000548928976, "epoch": 0.28788304751194826, "grad_norm": 0.03616062551736832, "learning_rate": 3.6827458256029685e-06, "loss": 0.9350194931030273, "mean_token_accuracy": 0.7552345667034388, "num_tokens": 21132002.0, "step": 160 }, { "entropy": 0.962575543858111, "epoch": 0.305875737981445, "grad_norm": 0.031624436378479004, "learning_rate": 3.5899814471243043e-06, "loss": 0.9099706649780274, "mean_token_accuracy": 0.7614207146689296, "num_tokens": 22456610.0, "step": 170 }, { "entropy": 0.981575589068234, "epoch": 0.3238684284509418, "grad_norm": 0.03008902259171009, "learning_rate": 3.49721706864564e-06, "loss": 0.9275808334350586, "mean_token_accuracy": 0.7563599238172174, "num_tokens": 23784860.0, "step": 180 }, { "entropy": 0.9543529843911529, "epoch": 0.3418611189204386, "grad_norm": 0.03235575929284096, "learning_rate": 3.404452690166976e-06, "loss": 0.9126798629760742, "mean_token_accuracy": 0.7603292245417833, "num_tokens": 25106610.0, "step": 190 }, { "entropy": 0.9536242228001356, "epoch": 0.35985380938993533, "grad_norm": 0.033603642135858536, "learning_rate": 3.311688311688312e-06, "loss": 0.9094326019287109, "mean_token_accuracy": 0.7603268170729279, "num_tokens": 26404730.0, "step": 200 }, { "entropy": 0.9402435509487986, "epoch": 0.3778464998594321, "grad_norm": 0.029900604858994484, "learning_rate": 3.218923933209648e-06, "loss": 0.8853635787963867, "mean_token_accuracy": 0.7637220246717333, "num_tokens": 27746430.0, "step": 210 }, { "entropy": 0.9270002828910947, "epoch": 0.3958391903289289, "grad_norm": 0.03154909983277321, "learning_rate": 3.1261595547309838e-06, "loss": 0.8845057487487793, "mean_token_accuracy": 0.7643253333866596, "num_tokens": 29091240.0, "step": 220 }, { "entropy": 0.9196253689005971, "epoch": 0.41383188079842564, "grad_norm": 0.028953028842806816, "learning_rate": 3.0333951762523196e-06, "loss": 0.880043888092041, "mean_token_accuracy": 0.7643528375774622, "num_tokens": 30412544.0, "step": 230 }, { "entropy": 0.9138461783528328, "epoch": 0.4318245712679224, "grad_norm": 0.028740836307406425, "learning_rate": 2.9406307977736554e-06, "loss": 0.8804447174072265, "mean_token_accuracy": 0.7650679206475616, "num_tokens": 31721248.0, "step": 240 }, { "entropy": 0.9258439548313617, "epoch": 0.44981726173741915, "grad_norm": 0.027906838804483414, "learning_rate": 2.8478664192949912e-06, "loss": 0.8891608238220214, "mean_token_accuracy": 0.7623051449656486, "num_tokens": 33030621.0, "step": 250 }, { "entropy": 0.9231391252949834, "epoch": 0.46780995220691596, "grad_norm": 0.027720769867300987, "learning_rate": 2.7551020408163266e-06, "loss": 0.9020990371704102, "mean_token_accuracy": 0.7595951380208135, "num_tokens": 34328254.0, "step": 260 }, { "entropy": 0.9248277079313993, "epoch": 0.4858026426764127, "grad_norm": 0.028005970641970634, "learning_rate": 2.6623376623376624e-06, "loss": 0.8968218803405762, "mean_token_accuracy": 0.7620166089385748, "num_tokens": 35639568.0, "step": 270 }, { "entropy": 0.9164260600693523, "epoch": 0.5037953331459095, "grad_norm": 0.025676406919956207, "learning_rate": 2.5695732838589982e-06, "loss": 0.894569206237793, "mean_token_accuracy": 0.7612657260149718, "num_tokens": 36947904.0, "step": 280 }, { "entropy": 0.9089541524648667, "epoch": 0.5217880236154062, "grad_norm": 0.028434382751584053, "learning_rate": 2.476808905380334e-06, "loss": 0.8868412017822266, "mean_token_accuracy": 0.763394633680582, "num_tokens": 38281521.0, "step": 290 }, { "entropy": 0.9049528720788658, "epoch": 0.539780714084903, "grad_norm": 0.02663426101207733, "learning_rate": 2.38404452690167e-06, "loss": 0.8812618255615234, "mean_token_accuracy": 0.7641567781567573, "num_tokens": 39595803.0, "step": 300 }, { "entropy": 0.900223555136472, "epoch": 0.5577734045543997, "grad_norm": 0.026907267048954964, "learning_rate": 2.2912801484230057e-06, "loss": 0.8773960113525391, "mean_token_accuracy": 0.7646851245313883, "num_tokens": 40918054.0, "step": 310 }, { "entropy": 0.9072908268310129, "epoch": 0.5757660950238965, "grad_norm": 0.033084969967603683, "learning_rate": 2.1985157699443415e-06, "loss": 0.8849006652832031, "mean_token_accuracy": 0.7633785914629698, "num_tokens": 42245476.0, "step": 320 }, { "entropy": 0.9075088860467077, "epoch": 0.5937587854933933, "grad_norm": 0.029511412605643272, "learning_rate": 2.1057513914656773e-06, "loss": 0.8799509048461914, "mean_token_accuracy": 0.7644402593374252, "num_tokens": 43592571.0, "step": 330 }, { "entropy": 0.897929747030139, "epoch": 0.61175147596289, "grad_norm": 0.027747338637709618, "learning_rate": 2.012987012987013e-06, "loss": 0.8784950256347657, "mean_token_accuracy": 0.7654943082481622, "num_tokens": 44949762.0, "step": 340 }, { "entropy": 0.8959064597263933, "epoch": 0.6297441664323868, "grad_norm": 0.02585972286760807, "learning_rate": 1.920222634508349e-06, "loss": 0.8677197456359863, "mean_token_accuracy": 0.7666845623403787, "num_tokens": 46266907.0, "step": 350 }, { "entropy": 0.9085025515407323, "epoch": 0.6477368569018837, "grad_norm": 0.026946574449539185, "learning_rate": 1.8274582560296848e-06, "loss": 0.8925327301025391, "mean_token_accuracy": 0.7623184407129884, "num_tokens": 47577598.0, "step": 360 }, { "entropy": 0.8742405578494072, "epoch": 0.6657295473713803, "grad_norm": 0.026929043233394623, "learning_rate": 1.7346938775510206e-06, "loss": 0.8524269104003906, "mean_token_accuracy": 0.7705512259155511, "num_tokens": 48888300.0, "step": 370 }, { "entropy": 0.9005698974244296, "epoch": 0.6837222378408772, "grad_norm": 0.027014046907424927, "learning_rate": 1.6419294990723564e-06, "loss": 0.8712619781494141, "mean_token_accuracy": 0.7643290877342224, "num_tokens": 50229069.0, "step": 380 }, { "entropy": 0.8819140480831266, "epoch": 0.701714928310374, "grad_norm": 0.028174864128232002, "learning_rate": 1.5491651205936922e-06, "loss": 0.8646106719970703, "mean_token_accuracy": 0.7674408122897148, "num_tokens": 51578947.0, "step": 390 }, { "entropy": 0.8925842920318245, "epoch": 0.7197076187798707, "grad_norm": 0.027017617598176003, "learning_rate": 1.456400742115028e-06, "loss": 0.8714614868164062, "mean_token_accuracy": 0.7669254776090384, "num_tokens": 52930805.0, "step": 400 }, { "entropy": 0.889844935759902, "epoch": 0.7377003092493675, "grad_norm": 0.02721812203526497, "learning_rate": 1.3636363636363636e-06, "loss": 0.8674912452697754, "mean_token_accuracy": 0.7662461360916495, "num_tokens": 54224294.0, "step": 410 }, { "entropy": 0.8719520575366915, "epoch": 0.7556929997188642, "grad_norm": 0.028012819588184357, "learning_rate": 1.2708719851576994e-06, "loss": 0.8511224746704101, "mean_token_accuracy": 0.7702083302661776, "num_tokens": 55540584.0, "step": 420 }, { "entropy": 0.8898111075162888, "epoch": 0.773685690188361, "grad_norm": 0.02642475627362728, "learning_rate": 1.1781076066790352e-06, "loss": 0.8730297088623047, "mean_token_accuracy": 0.7653367448598146, "num_tokens": 56827841.0, "step": 430 }, { "entropy": 0.8857162812724709, "epoch": 0.7916783806578578, "grad_norm": 0.02740148827433586, "learning_rate": 1.0853432282003713e-06, "loss": 0.8713733673095703, "mean_token_accuracy": 0.7659575197845697, "num_tokens": 58130682.0, "step": 440 }, { "entropy": 0.8843438906595111, "epoch": 0.8096710711273545, "grad_norm": 0.025668496266007423, "learning_rate": 9.925788497217069e-07, "loss": 0.8760784149169922, "mean_token_accuracy": 0.7651905825361609, "num_tokens": 59444140.0, "step": 450 }, { "entropy": 0.876284147053957, "epoch": 0.8276637615968513, "grad_norm": 0.026019152253866196, "learning_rate": 8.998144712430428e-07, "loss": 0.8590941429138184, "mean_token_accuracy": 0.7688775883987546, "num_tokens": 60778522.0, "step": 460 }, { "entropy": 0.8704025126062334, "epoch": 0.8456564520663481, "grad_norm": 0.024385536089539528, "learning_rate": 8.070500927643786e-07, "loss": 0.8481533050537109, "mean_token_accuracy": 0.7709953064098954, "num_tokens": 62138075.0, "step": 470 }, { "entropy": 0.886689430475235, "epoch": 0.8636491425358448, "grad_norm": 0.027147600427269936, "learning_rate": 7.142857142857143e-07, "loss": 0.8655129432678222, "mean_token_accuracy": 0.7670928187668323, "num_tokens": 63450349.0, "step": 480 }, { "entropy": 0.8841921042650938, "epoch": 0.8816418330053416, "grad_norm": 0.025846796110272408, "learning_rate": 6.215213358070501e-07, "loss": 0.8744302749633789, "mean_token_accuracy": 0.7654220588505268, "num_tokens": 64770576.0, "step": 490 }, { "entropy": 0.8944361335597932, "epoch": 0.8996345234748383, "grad_norm": 0.025025852024555206, "learning_rate": 5.287569573283859e-07, "loss": 0.8789453506469727, "mean_token_accuracy": 0.7639346193522215, "num_tokens": 66113087.0, "step": 500 }, { "entropy": 0.8843724082224071, "epoch": 0.9176272139443351, "grad_norm": 0.02651493437588215, "learning_rate": 4.359925788497217e-07, "loss": 0.8675421714782715, "mean_token_accuracy": 0.7664000844582916, "num_tokens": 67464302.0, "step": 510 }, { "entropy": 0.8899071650579572, "epoch": 0.9356199044138319, "grad_norm": 0.025058092549443245, "learning_rate": 3.4322820037105757e-07, "loss": 0.879638385772705, "mean_token_accuracy": 0.7650359075516462, "num_tokens": 68809443.0, "step": 520 }, { "entropy": 0.8678001549094916, "epoch": 0.9536125948833286, "grad_norm": 0.025574836879968643, "learning_rate": 2.5046382189239333e-07, "loss": 0.8517162322998046, "mean_token_accuracy": 0.7706384485587477, "num_tokens": 70130884.0, "step": 530 }, { "entropy": 0.8980348063632846, "epoch": 0.9716052853528254, "grad_norm": 0.02690030448138714, "learning_rate": 1.5769944341372915e-07, "loss": 0.8926727294921875, "mean_token_accuracy": 0.7621918022632599, "num_tokens": 71446103.0, "step": 540 }, { "entropy": 0.8809462685137988, "epoch": 0.9895979758223222, "grad_norm": 0.02480347640812397, "learning_rate": 6.493506493506495e-08, "loss": 0.8590832710266113, "mean_token_accuracy": 0.7687337175011635, "num_tokens": 72793622.0, "step": 550 } ], "logging_steps": 10, "max_steps": 556, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0599989240114708e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }