{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 37040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08099352051835854, "grad_norm": 1.2232078313827515, "learning_rate": 8.08108108108108e-05, "loss": 8.4372, "step": 300 }, { "epoch": 0.16198704103671707, "grad_norm": 1.2085011005401611, "learning_rate": 9.9375511317153e-05, "loss": 7.116, "step": 600 }, { "epoch": 0.24298056155507558, "grad_norm": 0.9084240794181824, "learning_rate": 9.855740387237524e-05, "loss": 6.8316, "step": 900 }, { "epoch": 0.32397408207343414, "grad_norm": 0.9802172780036926, "learning_rate": 9.77392964275975e-05, "loss": 6.6389, "step": 1200 }, { "epoch": 0.40496760259179265, "grad_norm": 1.2083544731140137, "learning_rate": 9.692118898281974e-05, "loss": 6.5172, "step": 1500 }, { "epoch": 0.48596112311015116, "grad_norm": 1.1617285013198853, "learning_rate": 9.610308153804201e-05, "loss": 6.3892, "step": 1800 }, { "epoch": 0.5669546436285097, "grad_norm": 1.1057401895523071, "learning_rate": 9.528497409326425e-05, "loss": 6.2733, "step": 2100 }, { "epoch": 0.6479481641468683, "grad_norm": 1.1652714014053345, "learning_rate": 9.446686664848651e-05, "loss": 6.1542, "step": 2400 }, { "epoch": 0.7289416846652268, "grad_norm": 1.1327797174453735, "learning_rate": 9.364875920370875e-05, "loss": 6.0642, "step": 2700 }, { "epoch": 0.8099352051835853, "grad_norm": 1.1129639148712158, "learning_rate": 9.283065175893101e-05, "loss": 5.9595, "step": 3000 }, { "epoch": 0.8909287257019438, "grad_norm": 1.1772035360336304, "learning_rate": 9.201254431415327e-05, "loss": 5.8886, "step": 3300 }, { "epoch": 0.9719222462203023, "grad_norm": 1.1345826387405396, "learning_rate": 9.119443686937551e-05, "loss": 5.8212, "step": 3600 }, { "epoch": 1.0529157667386608, "grad_norm": 1.1569806337356567, "learning_rate": 9.037632942459777e-05, "loss": 5.6998, "step": 3900 }, { "epoch": 1.1339092872570196, "grad_norm": 1.2677946090698242, "learning_rate": 8.955822197982002e-05, "loss": 5.6339, "step": 4200 }, { "epoch": 1.214902807775378, "grad_norm": 1.2447303533554077, "learning_rate": 8.874011453504228e-05, "loss": 5.6141, "step": 4500 }, { "epoch": 1.2958963282937366, "grad_norm": 1.2059963941574097, "learning_rate": 8.792200709026452e-05, "loss": 5.5655, "step": 4800 }, { "epoch": 1.376889848812095, "grad_norm": 1.2978270053863525, "learning_rate": 8.710389964548678e-05, "loss": 5.5272, "step": 5100 }, { "epoch": 1.4578833693304536, "grad_norm": 1.2664310932159424, "learning_rate": 8.628579220070902e-05, "loss": 5.476, "step": 5400 }, { "epoch": 1.538876889848812, "grad_norm": 1.2178924083709717, "learning_rate": 8.546768475593129e-05, "loss": 5.4346, "step": 5700 }, { "epoch": 1.6198704103671706, "grad_norm": 1.266420841217041, "learning_rate": 8.464957731115354e-05, "loss": 5.4054, "step": 6000 }, { "epoch": 1.7008639308855291, "grad_norm": 1.23899507522583, "learning_rate": 8.383146986637579e-05, "loss": 5.3561, "step": 6300 }, { "epoch": 1.7818574514038876, "grad_norm": 1.3659826517105103, "learning_rate": 8.301336242159804e-05, "loss": 5.3211, "step": 6600 }, { "epoch": 1.8628509719222461, "grad_norm": 1.2756917476654053, "learning_rate": 8.219525497682029e-05, "loss": 5.2925, "step": 6900 }, { "epoch": 1.9438444924406046, "grad_norm": 1.329283356666565, "learning_rate": 8.137714753204254e-05, "loss": 5.2797, "step": 7200 }, { "epoch": 2.024838012958963, "grad_norm": 1.422574758529663, "learning_rate": 8.05590400872648e-05, "loss": 5.1991, "step": 7500 }, { "epoch": 2.1058315334773217, "grad_norm": 1.2917046546936035, "learning_rate": 7.974093264248705e-05, "loss": 5.0749, "step": 7800 }, { "epoch": 2.18682505399568, "grad_norm": 1.3476394414901733, "learning_rate": 7.89228251977093e-05, "loss": 5.0792, "step": 8100 }, { "epoch": 2.267818574514039, "grad_norm": 1.3360469341278076, "learning_rate": 7.810471775293156e-05, "loss": 5.0675, "step": 8400 }, { "epoch": 2.3488120950323976, "grad_norm": 1.4274685382843018, "learning_rate": 7.72866103081538e-05, "loss": 5.0484, "step": 8700 }, { "epoch": 2.429805615550756, "grad_norm": 1.3693740367889404, "learning_rate": 7.646850286337606e-05, "loss": 5.0288, "step": 9000 }, { "epoch": 2.5107991360691146, "grad_norm": 1.4013055562973022, "learning_rate": 7.56503954185983e-05, "loss": 5.0148, "step": 9300 }, { "epoch": 2.591792656587473, "grad_norm": 1.374281406402588, "learning_rate": 7.483228797382058e-05, "loss": 4.9978, "step": 9600 }, { "epoch": 2.6727861771058317, "grad_norm": 1.4153590202331543, "learning_rate": 7.401418052904282e-05, "loss": 4.9705, "step": 9900 }, { "epoch": 2.75377969762419, "grad_norm": 1.3654042482376099, "learning_rate": 7.319607308426508e-05, "loss": 4.9599, "step": 10200 }, { "epoch": 2.8347732181425487, "grad_norm": 1.3114956617355347, "learning_rate": 7.237796563948732e-05, "loss": 4.9379, "step": 10500 }, { "epoch": 2.915766738660907, "grad_norm": 1.4419208765029907, "learning_rate": 7.155985819470958e-05, "loss": 4.9319, "step": 10800 }, { "epoch": 2.9967602591792657, "grad_norm": 1.3786503076553345, "learning_rate": 7.074175074993182e-05, "loss": 4.894, "step": 11100 }, { "epoch": 3.077753779697624, "grad_norm": 1.406146764755249, "learning_rate": 6.992364330515409e-05, "loss": 4.731, "step": 11400 }, { "epoch": 3.1587473002159827, "grad_norm": 1.4777101278305054, "learning_rate": 6.910553586037633e-05, "loss": 4.723, "step": 11700 }, { "epoch": 3.239740820734341, "grad_norm": 1.5643523931503296, "learning_rate": 6.828742841559859e-05, "loss": 4.7243, "step": 12000 }, { "epoch": 3.3207343412526997, "grad_norm": 1.4379490613937378, "learning_rate": 6.746932097082083e-05, "loss": 4.7216, "step": 12300 }, { "epoch": 3.4017278617710582, "grad_norm": 1.5200529098510742, "learning_rate": 6.665121352604309e-05, "loss": 4.7187, "step": 12600 }, { "epoch": 3.4827213822894167, "grad_norm": 1.573749303817749, "learning_rate": 6.583310608126535e-05, "loss": 4.7152, "step": 12900 }, { "epoch": 3.5637149028077753, "grad_norm": 1.555275559425354, "learning_rate": 6.50149986364876e-05, "loss": 4.6872, "step": 13200 }, { "epoch": 3.6447084233261338, "grad_norm": 1.5814576148986816, "learning_rate": 6.419689119170985e-05, "loss": 4.678, "step": 13500 }, { "epoch": 3.7257019438444923, "grad_norm": 1.4575761556625366, "learning_rate": 6.33787837469321e-05, "loss": 4.6898, "step": 13800 }, { "epoch": 3.806695464362851, "grad_norm": 1.4241302013397217, "learning_rate": 6.256067630215436e-05, "loss": 4.6746, "step": 14100 }, { "epoch": 3.8876889848812093, "grad_norm": 1.4637110233306885, "learning_rate": 6.17425688573766e-05, "loss": 4.6634, "step": 14400 }, { "epoch": 3.968682505399568, "grad_norm": 1.5187498331069946, "learning_rate": 6.092446141259885e-05, "loss": 4.6632, "step": 14700 }, { "epoch": 4.049676025917926, "grad_norm": 1.6040258407592773, "learning_rate": 6.010635396782111e-05, "loss": 4.557, "step": 15000 }, { "epoch": 4.130669546436285, "grad_norm": 1.6472535133361816, "learning_rate": 5.9288246523043365e-05, "loss": 4.4817, "step": 15300 }, { "epoch": 4.211663066954643, "grad_norm": 1.6630454063415527, "learning_rate": 5.8470139078265615e-05, "loss": 4.4911, "step": 15600 }, { "epoch": 4.292656587473002, "grad_norm": 1.5368226766586304, "learning_rate": 5.765203163348787e-05, "loss": 4.4818, "step": 15900 }, { "epoch": 4.37365010799136, "grad_norm": 1.6154084205627441, "learning_rate": 5.683392418871012e-05, "loss": 4.4715, "step": 16200 }, { "epoch": 4.454643628509719, "grad_norm": 1.5402164459228516, "learning_rate": 5.601581674393237e-05, "loss": 4.4817, "step": 16500 }, { "epoch": 4.535637149028078, "grad_norm": 1.6153959035873413, "learning_rate": 5.519770929915462e-05, "loss": 4.4711, "step": 16800 }, { "epoch": 4.616630669546437, "grad_norm": 1.5631442070007324, "learning_rate": 5.4379601854376885e-05, "loss": 4.463, "step": 17100 }, { "epoch": 4.697624190064795, "grad_norm": 1.620991587638855, "learning_rate": 5.3561494409599135e-05, "loss": 4.4621, "step": 17400 }, { "epoch": 4.778617710583154, "grad_norm": 1.742760181427002, "learning_rate": 5.2743386964821385e-05, "loss": 4.46, "step": 17700 }, { "epoch": 4.859611231101512, "grad_norm": 1.6621677875518799, "learning_rate": 5.1925279520043635e-05, "loss": 4.4725, "step": 18000 }, { "epoch": 4.940604751619871, "grad_norm": 1.553416132926941, "learning_rate": 5.1107172075265885e-05, "loss": 4.4523, "step": 18300 }, { "epoch": 5.021598272138229, "grad_norm": 1.6929900646209717, "learning_rate": 5.0289064630488135e-05, "loss": 4.402, "step": 18600 }, { "epoch": 5.102591792656588, "grad_norm": 1.6716270446777344, "learning_rate": 4.947095718571039e-05, "loss": 4.2694, "step": 18900 }, { "epoch": 5.183585313174946, "grad_norm": 1.7076847553253174, "learning_rate": 4.865284974093264e-05, "loss": 4.2792, "step": 19200 }, { "epoch": 5.264578833693305, "grad_norm": 1.6706926822662354, "learning_rate": 4.78347422961549e-05, "loss": 4.2939, "step": 19500 }, { "epoch": 5.345572354211663, "grad_norm": 1.7526054382324219, "learning_rate": 4.701663485137715e-05, "loss": 4.2971, "step": 19800 }, { "epoch": 5.426565874730022, "grad_norm": 1.7769668102264404, "learning_rate": 4.61985274065994e-05, "loss": 4.2927, "step": 20100 }, { "epoch": 5.50755939524838, "grad_norm": 1.736707091331482, "learning_rate": 4.5380419961821655e-05, "loss": 4.3044, "step": 20400 }, { "epoch": 5.588552915766739, "grad_norm": 1.7048243284225464, "learning_rate": 4.4562312517043905e-05, "loss": 4.2971, "step": 20700 }, { "epoch": 5.669546436285097, "grad_norm": 1.7873897552490234, "learning_rate": 4.374420507226616e-05, "loss": 4.2918, "step": 21000 }, { "epoch": 5.750539956803456, "grad_norm": 1.6965153217315674, "learning_rate": 4.292609762748841e-05, "loss": 4.279, "step": 21300 }, { "epoch": 5.831533477321814, "grad_norm": 1.6777267456054688, "learning_rate": 4.210799018271067e-05, "loss": 4.3005, "step": 21600 }, { "epoch": 5.912526997840173, "grad_norm": 1.7366621494293213, "learning_rate": 4.128988273793292e-05, "loss": 4.288, "step": 21900 }, { "epoch": 5.993520518358531, "grad_norm": 1.6858952045440674, "learning_rate": 4.047177529315517e-05, "loss": 4.3032, "step": 22200 }, { "epoch": 6.07451403887689, "grad_norm": 1.755344033241272, "learning_rate": 3.9653667848377425e-05, "loss": 4.1395, "step": 22500 }, { "epoch": 6.155507559395248, "grad_norm": 1.8167041540145874, "learning_rate": 3.8835560403599675e-05, "loss": 4.1397, "step": 22800 }, { "epoch": 6.236501079913607, "grad_norm": 1.777757167816162, "learning_rate": 3.8017452958821925e-05, "loss": 4.1328, "step": 23100 }, { "epoch": 6.317494600431965, "grad_norm": 1.8104006052017212, "learning_rate": 3.719934551404418e-05, "loss": 4.1485, "step": 23400 }, { "epoch": 6.398488120950324, "grad_norm": 2.2477872371673584, "learning_rate": 3.638123806926643e-05, "loss": 4.1426, "step": 23700 }, { "epoch": 6.479481641468682, "grad_norm": 1.884135365486145, "learning_rate": 3.556313062448868e-05, "loss": 4.1408, "step": 24000 }, { "epoch": 6.560475161987041, "grad_norm": 1.9256831407546997, "learning_rate": 3.474502317971094e-05, "loss": 4.1442, "step": 24300 }, { "epoch": 6.6414686825053995, "grad_norm": 1.838258981704712, "learning_rate": 3.392691573493319e-05, "loss": 4.146, "step": 24600 }, { "epoch": 6.722462203023758, "grad_norm": 1.8604072332382202, "learning_rate": 3.310880829015544e-05, "loss": 4.1371, "step": 24900 }, { "epoch": 6.8034557235421165, "grad_norm": 1.8495209217071533, "learning_rate": 3.2290700845377695e-05, "loss": 4.1388, "step": 25200 }, { "epoch": 6.884449244060475, "grad_norm": 1.8702112436294556, "learning_rate": 3.1472593400599945e-05, "loss": 4.1355, "step": 25500 }, { "epoch": 6.9654427645788335, "grad_norm": 1.8808040618896484, "learning_rate": 3.06544859558222e-05, "loss": 4.1427, "step": 25800 }, { "epoch": 7.046436285097192, "grad_norm": 1.8303903341293335, "learning_rate": 2.9836378511044448e-05, "loss": 4.0723, "step": 26100 }, { "epoch": 7.1274298056155505, "grad_norm": 1.9697943925857544, "learning_rate": 2.9018271066266705e-05, "loss": 3.9898, "step": 26400 }, { "epoch": 7.208423326133909, "grad_norm": 1.998304843902588, "learning_rate": 2.8200163621488955e-05, "loss": 3.997, "step": 26700 }, { "epoch": 7.2894168466522675, "grad_norm": 1.962162971496582, "learning_rate": 2.7382056176711208e-05, "loss": 4.009, "step": 27000 }, { "epoch": 7.370410367170626, "grad_norm": 1.8969253301620483, "learning_rate": 2.6563948731933465e-05, "loss": 4.0181, "step": 27300 }, { "epoch": 7.4514038876889845, "grad_norm": 1.9282795190811157, "learning_rate": 2.5745841287155715e-05, "loss": 4.015, "step": 27600 }, { "epoch": 7.532397408207343, "grad_norm": 1.9588371515274048, "learning_rate": 2.4927733842377968e-05, "loss": 4.0216, "step": 27900 }, { "epoch": 7.613390928725702, "grad_norm": 2.0260281562805176, "learning_rate": 2.4109626397600218e-05, "loss": 4.0161, "step": 28200 }, { "epoch": 7.69438444924406, "grad_norm": 1.9617438316345215, "learning_rate": 2.329151895282247e-05, "loss": 4.0177, "step": 28500 }, { "epoch": 7.775377969762419, "grad_norm": 1.9317694902420044, "learning_rate": 2.2473411508044725e-05, "loss": 4.0203, "step": 28800 }, { "epoch": 7.856371490280777, "grad_norm": 2.1097474098205566, "learning_rate": 2.1655304063266975e-05, "loss": 4.0226, "step": 29100 }, { "epoch": 7.937365010799136, "grad_norm": 1.9996739625930786, "learning_rate": 2.083719661848923e-05, "loss": 4.0139, "step": 29400 }, { "epoch": 8.018358531317494, "grad_norm": 1.9552192687988281, "learning_rate": 2.001908917371148e-05, "loss": 3.9961, "step": 29700 }, { "epoch": 8.099352051835853, "grad_norm": 1.9899965524673462, "learning_rate": 1.9200981728933735e-05, "loss": 3.8958, "step": 30000 }, { "epoch": 8.180345572354211, "grad_norm": 2.0817723274230957, "learning_rate": 1.8382874284155988e-05, "loss": 3.887, "step": 30300 }, { "epoch": 8.26133909287257, "grad_norm": 2.086763858795166, "learning_rate": 1.7564766839378238e-05, "loss": 3.9098, "step": 30600 }, { "epoch": 8.342332613390928, "grad_norm": 2.0301589965820312, "learning_rate": 1.674665939460049e-05, "loss": 3.8974, "step": 30900 }, { "epoch": 8.423326133909287, "grad_norm": 2.0822806358337402, "learning_rate": 1.5928551949822745e-05, "loss": 3.9106, "step": 31200 }, { "epoch": 8.504319654427645, "grad_norm": 2.1793293952941895, "learning_rate": 1.5110444505044996e-05, "loss": 3.9102, "step": 31500 }, { "epoch": 8.585313174946004, "grad_norm": 2.0697715282440186, "learning_rate": 1.429233706026725e-05, "loss": 3.9044, "step": 31800 }, { "epoch": 8.666306695464362, "grad_norm": 2.0902247428894043, "learning_rate": 1.34742296154895e-05, "loss": 3.9191, "step": 32100 }, { "epoch": 8.74730021598272, "grad_norm": 2.062403917312622, "learning_rate": 1.2656122170711755e-05, "loss": 3.9175, "step": 32400 }, { "epoch": 8.82829373650108, "grad_norm": 2.048679828643799, "learning_rate": 1.1838014725934006e-05, "loss": 3.9124, "step": 32700 }, { "epoch": 8.909287257019438, "grad_norm": 2.103947401046753, "learning_rate": 1.101990728115626e-05, "loss": 3.9082, "step": 33000 }, { "epoch": 8.990280777537796, "grad_norm": 2.0932586193084717, "learning_rate": 1.0201799836378511e-05, "loss": 3.9139, "step": 33300 }, { "epoch": 9.071274298056155, "grad_norm": 2.065955877304077, "learning_rate": 9.383692391600765e-06, "loss": 3.8253, "step": 33600 }, { "epoch": 9.152267818574513, "grad_norm": 2.127322196960449, "learning_rate": 8.565584946823016e-06, "loss": 3.8279, "step": 33900 }, { "epoch": 9.233261339092872, "grad_norm": 2.0906572341918945, "learning_rate": 7.74747750204527e-06, "loss": 3.826, "step": 34200 }, { "epoch": 9.314254859611232, "grad_norm": 2.142390489578247, "learning_rate": 6.929370057267521e-06, "loss": 3.8186, "step": 34500 }, { "epoch": 9.39524838012959, "grad_norm": 2.0886802673339844, "learning_rate": 6.111262612489774e-06, "loss": 3.829, "step": 34800 }, { "epoch": 9.476241900647949, "grad_norm": 2.1843161582946777, "learning_rate": 5.293155167712026e-06, "loss": 3.8274, "step": 35100 }, { "epoch": 9.557235421166308, "grad_norm": 2.1167829036712646, "learning_rate": 4.475047722934279e-06, "loss": 3.8244, "step": 35400 }, { "epoch": 9.638228941684666, "grad_norm": 2.1455206871032715, "learning_rate": 3.6569402781565316e-06, "loss": 3.8478, "step": 35700 }, { "epoch": 9.719222462203025, "grad_norm": 2.0749266147613525, "learning_rate": 2.838832833378784e-06, "loss": 3.8233, "step": 36000 }, { "epoch": 9.800215982721383, "grad_norm": 2.1683921813964844, "learning_rate": 2.0207253886010365e-06, "loss": 3.8196, "step": 36300 }, { "epoch": 9.881209503239742, "grad_norm": 2.2705883979797363, "learning_rate": 1.2026179438232888e-06, "loss": 3.8316, "step": 36600 }, { "epoch": 9.9622030237581, "grad_norm": 2.1019952297210693, "learning_rate": 3.8451049904554137e-07, "loss": 3.821, "step": 36900 } ], "logging_steps": 300, "max_steps": 37040, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.741560324096e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }