675 lines
19 KiB
JSON
675 lines
19 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9862844814301125,
|
|
"eval_steps": 500,
|
|
"global_step": 1600,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 5.028895375132561,
|
|
"epoch": 0.015410695022345508,
|
|
"grad_norm": 6.3125,
|
|
"learning_rate": 4.897959183673469e-06,
|
|
"loss": 7.5902880859375,
|
|
"mean_token_accuracy": 0.09856362253893167,
|
|
"num_tokens": 132392.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 5.056147763133049,
|
|
"epoch": 0.030821390044691015,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 1e-05,
|
|
"loss": 4.8427432250976565,
|
|
"mean_token_accuracy": 0.33738345025107264,
|
|
"num_tokens": 254164.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 3.3333908554911615,
|
|
"epoch": 0.04623208506703652,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 9.841168996188057e-06,
|
|
"loss": 3.2706063842773436,
|
|
"mean_token_accuracy": 0.5363785127736628,
|
|
"num_tokens": 383737.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 2.5955499114096163,
|
|
"epoch": 0.06164278008938203,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 9.682337992376113e-06,
|
|
"loss": 2.6094720458984373,
|
|
"mean_token_accuracy": 0.631682768985629,
|
|
"num_tokens": 516457.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 2.3073494301736357,
|
|
"epoch": 0.07705347511172754,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 9.523506988564168e-06,
|
|
"loss": 2.3903713989257813,
|
|
"mean_token_accuracy": 0.6567921816185117,
|
|
"num_tokens": 636033.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 2.3254229539632796,
|
|
"epoch": 0.09246417013407304,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 9.364675984752224e-06,
|
|
"loss": 2.4465567016601564,
|
|
"mean_token_accuracy": 0.6496502718515694,
|
|
"num_tokens": 777342.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 2.2117018654197453,
|
|
"epoch": 0.10787486515641856,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 9.20584498094028e-06,
|
|
"loss": 2.288543701171875,
|
|
"mean_token_accuracy": 0.664428948648274,
|
|
"num_tokens": 902453.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 2.174446207880974,
|
|
"epoch": 0.12328556017876406,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 9.047013977128337e-06,
|
|
"loss": 2.327859649658203,
|
|
"mean_token_accuracy": 0.6647384916990995,
|
|
"num_tokens": 1039427.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 2.0399726448208093,
|
|
"epoch": 0.13869625520110956,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 8.888182973316391e-06,
|
|
"loss": 2.156089324951172,
|
|
"mean_token_accuracy": 0.6871329558640719,
|
|
"num_tokens": 1166840.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 2.071619209870696,
|
|
"epoch": 0.15410695022345508,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 8.729351969504447e-06,
|
|
"loss": 2.2405677795410157,
|
|
"mean_token_accuracy": 0.680448934994638,
|
|
"num_tokens": 1300789.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 2.032237692028284,
|
|
"epoch": 0.1695176452458006,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 8.570520965692504e-06,
|
|
"loss": 2.1679531860351564,
|
|
"mean_token_accuracy": 0.6796054230630397,
|
|
"num_tokens": 1432596.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 1.9784896748512983,
|
|
"epoch": 0.18492834026814609,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 8.41168996188056e-06,
|
|
"loss": 2.0713957214355467,
|
|
"mean_token_accuracy": 0.6908648996800184,
|
|
"num_tokens": 1559688.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 1.9056549924612045,
|
|
"epoch": 0.2003390352904916,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 8.252858958068616e-06,
|
|
"loss": 2.1153318786621096,
|
|
"mean_token_accuracy": 0.7020338359847664,
|
|
"num_tokens": 1688970.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 1.9747070623934269,
|
|
"epoch": 0.21574973031283712,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 8.09402795425667e-06,
|
|
"loss": 2.1253143310546876,
|
|
"mean_token_accuracy": 0.6871407954767347,
|
|
"num_tokens": 1816182.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 2.024077450931072,
|
|
"epoch": 0.2311604253351826,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 7.935196950444729e-06,
|
|
"loss": 2.1842442321777344,
|
|
"mean_token_accuracy": 0.677086523026228,
|
|
"num_tokens": 1950644.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 1.9184869919717311,
|
|
"epoch": 0.24657112035752812,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 7.776365946632783e-06,
|
|
"loss": 2.0085203552246096,
|
|
"mean_token_accuracy": 0.694020996466279,
|
|
"num_tokens": 2079928.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 1.8541498044878244,
|
|
"epoch": 0.26198181537987364,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 7.617534942820839e-06,
|
|
"loss": 2.0481745910644533,
|
|
"mean_token_accuracy": 0.7019787009432912,
|
|
"num_tokens": 2218227.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 1.9318342459201814,
|
|
"epoch": 0.27739251040221913,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 7.458703939008896e-06,
|
|
"loss": 1.995413818359375,
|
|
"mean_token_accuracy": 0.6925960695371032,
|
|
"num_tokens": 2350679.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 1.7835957117378711,
|
|
"epoch": 0.29280320542456467,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 7.299872935196951e-06,
|
|
"loss": 1.928093719482422,
|
|
"mean_token_accuracy": 0.7172103912383317,
|
|
"num_tokens": 2472981.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 1.9411964005231857,
|
|
"epoch": 0.30821390044691016,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 7.141041931385007e-06,
|
|
"loss": 2.106062774658203,
|
|
"mean_token_accuracy": 0.6849398523569107,
|
|
"num_tokens": 2614781.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 1.7948928633891046,
|
|
"epoch": 0.32362459546925565,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 6.982210927573063e-06,
|
|
"loss": 1.9581819152832032,
|
|
"mean_token_accuracy": 0.70925975356251,
|
|
"num_tokens": 2743217.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 1.720666101127863,
|
|
"epoch": 0.3390352904916012,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 6.823379923761118e-06,
|
|
"loss": 1.8939352416992188,
|
|
"mean_token_accuracy": 0.7219540763273835,
|
|
"num_tokens": 2872564.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 1.765952904894948,
|
|
"epoch": 0.3544459855139467,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 6.6645489199491745e-06,
|
|
"loss": 1.8741084289550782,
|
|
"mean_token_accuracy": 0.7158531962707638,
|
|
"num_tokens": 3003409.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 1.7450720983743668,
|
|
"epoch": 0.36985668053629217,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 6.505717916137231e-06,
|
|
"loss": 1.8735758972167968,
|
|
"mean_token_accuracy": 0.7193968405947089,
|
|
"num_tokens": 3134051.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 1.7807146763801576,
|
|
"epoch": 0.3852673755586377,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 6.346886912325286e-06,
|
|
"loss": 1.89009521484375,
|
|
"mean_token_accuracy": 0.7120489033311606,
|
|
"num_tokens": 3272289.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 1.688743471726775,
|
|
"epoch": 0.4006780705809832,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 6.188055908513342e-06,
|
|
"loss": 1.8537098693847656,
|
|
"mean_token_accuracy": 0.7283642463758588,
|
|
"num_tokens": 3395473.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 1.681125262901187,
|
|
"epoch": 0.4160887656033287,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 6.029224904701399e-06,
|
|
"loss": 1.7891111755371094,
|
|
"mean_token_accuracy": 0.7292297334969043,
|
|
"num_tokens": 3524237.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 1.7346787237748504,
|
|
"epoch": 0.43149946062567424,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 5.870393900889454e-06,
|
|
"loss": 1.9180752563476562,
|
|
"mean_token_accuracy": 0.716539504416287,
|
|
"num_tokens": 3660040.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 1.7547597530111672,
|
|
"epoch": 0.4469101556480197,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 5.71156289707751e-06,
|
|
"loss": 1.8909840393066406,
|
|
"mean_token_accuracy": 0.7133365147560835,
|
|
"num_tokens": 3791154.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 1.792205568253994,
|
|
"epoch": 0.4623208506703652,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 5.552731893265566e-06,
|
|
"loss": 1.93789306640625,
|
|
"mean_token_accuracy": 0.7044468146562576,
|
|
"num_tokens": 3930320.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 1.801283170208335,
|
|
"epoch": 0.47773154569271076,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 5.393900889453621e-06,
|
|
"loss": 1.971471405029297,
|
|
"mean_token_accuracy": 0.702786465510726,
|
|
"num_tokens": 4061353.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 1.7702496079355479,
|
|
"epoch": 0.49314224071505625,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 5.235069885641678e-06,
|
|
"loss": 1.8937255859375,
|
|
"mean_token_accuracy": 0.7105545987561345,
|
|
"num_tokens": 4188252.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 1.746090711504221,
|
|
"epoch": 0.5085529357374018,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 5.076238881829734e-06,
|
|
"loss": 1.8904119873046874,
|
|
"mean_token_accuracy": 0.710063861683011,
|
|
"num_tokens": 4319900.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 1.6968115794286132,
|
|
"epoch": 0.5239636307597473,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 4.91740787801779e-06,
|
|
"loss": 1.8843421936035156,
|
|
"mean_token_accuracy": 0.7204603585228324,
|
|
"num_tokens": 4452384.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 1.7424921029433609,
|
|
"epoch": 0.5393743257820928,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 4.758576874205845e-06,
|
|
"loss": 1.8412220764160157,
|
|
"mean_token_accuracy": 0.7172071708366274,
|
|
"num_tokens": 4583222.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 1.7446832180023193,
|
|
"epoch": 0.5547850208044383,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 4.599745870393902e-06,
|
|
"loss": 1.9152328491210937,
|
|
"mean_token_accuracy": 0.7119575057178735,
|
|
"num_tokens": 4715126.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 1.7576309859752655,
|
|
"epoch": 0.5701957158267837,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 4.440914866581957e-06,
|
|
"loss": 1.8862844848632812,
|
|
"mean_token_accuracy": 0.7049576634168625,
|
|
"num_tokens": 4850794.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 1.7258573825657368,
|
|
"epoch": 0.5856064108491293,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 4.282083862770013e-06,
|
|
"loss": 1.818639678955078,
|
|
"mean_token_accuracy": 0.716189993545413,
|
|
"num_tokens": 4981104.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 1.6470270904898643,
|
|
"epoch": 0.6010171058714748,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 4.123252858958069e-06,
|
|
"loss": 1.7824436950683593,
|
|
"mean_token_accuracy": 0.7320361129194498,
|
|
"num_tokens": 5114019.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 1.62755079947412,
|
|
"epoch": 0.6164278008938203,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 3.964421855146125e-06,
|
|
"loss": 1.7079803466796875,
|
|
"mean_token_accuracy": 0.7364445444941521,
|
|
"num_tokens": 5243964.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 1.7295116788893938,
|
|
"epoch": 0.6318384959161658,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 3.8055908513341803e-06,
|
|
"loss": 1.844159393310547,
|
|
"mean_token_accuracy": 0.712419720813632,
|
|
"num_tokens": 5377738.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 1.6804203514009715,
|
|
"epoch": 0.6472491909385113,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 3.6467598475222366e-06,
|
|
"loss": 1.8214646911621093,
|
|
"mean_token_accuracy": 0.7212847074493766,
|
|
"num_tokens": 5510981.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 1.5993686743080615,
|
|
"epoch": 0.6626598859608568,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 3.4879288437102924e-06,
|
|
"loss": 1.6954243469238282,
|
|
"mean_token_accuracy": 0.7390070861950516,
|
|
"num_tokens": 5633201.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 1.667136338762939,
|
|
"epoch": 0.6780705809832024,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 3.3290978398983487e-06,
|
|
"loss": 1.7823049926757812,
|
|
"mean_token_accuracy": 0.7207983901910484,
|
|
"num_tokens": 5759874.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 1.678079522177577,
|
|
"epoch": 0.6934812760055479,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 3.170266836086404e-06,
|
|
"loss": 1.7977125549316406,
|
|
"mean_token_accuracy": 0.7210700345411897,
|
|
"num_tokens": 5883476.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 1.6764129892736674,
|
|
"epoch": 0.7088919710278934,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 3.0114358322744603e-06,
|
|
"loss": 1.8482670593261719,
|
|
"mean_token_accuracy": 0.7182181442528963,
|
|
"num_tokens": 6011086.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 1.7141736481338739,
|
|
"epoch": 0.7243026660502389,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 2.852604828462516e-06,
|
|
"loss": 1.8182452392578126,
|
|
"mean_token_accuracy": 0.7161370900273323,
|
|
"num_tokens": 6146546.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 1.641965696439147,
|
|
"epoch": 0.7397133610725843,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 2.693773824650572e-06,
|
|
"loss": 1.779376220703125,
|
|
"mean_token_accuracy": 0.7270625644922256,
|
|
"num_tokens": 6281309.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 1.6535117710381746,
|
|
"epoch": 0.7551240560949298,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 2.534942820838628e-06,
|
|
"loss": 1.762202606201172,
|
|
"mean_token_accuracy": 0.7227025451511144,
|
|
"num_tokens": 6407111.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 1.5914642249792814,
|
|
"epoch": 0.7705347511172754,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 2.376111817026684e-06,
|
|
"loss": 1.7256475830078124,
|
|
"mean_token_accuracy": 0.7357470904290676,
|
|
"num_tokens": 6537257.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 1.7124161531031132,
|
|
"epoch": 0.7859454461396209,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 2.21728081321474e-06,
|
|
"loss": 1.8616783142089843,
|
|
"mean_token_accuracy": 0.7149319493025541,
|
|
"num_tokens": 6662719.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 1.6728860459476709,
|
|
"epoch": 0.8013561411619664,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 2.0584498094027953e-06,
|
|
"loss": 1.8268055725097656,
|
|
"mean_token_accuracy": 0.7221832738444209,
|
|
"num_tokens": 6798456.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 1.6639322647452355,
|
|
"epoch": 0.8167668361843119,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 1.8996188055908516e-06,
|
|
"loss": 1.7712481689453126,
|
|
"mean_token_accuracy": 0.72425989869982,
|
|
"num_tokens": 6928111.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 1.6321870504319669,
|
|
"epoch": 0.8321775312066574,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 1.7407878017789074e-06,
|
|
"loss": 1.8233981323242188,
|
|
"mean_token_accuracy": 0.728203468695283,
|
|
"num_tokens": 7062876.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 1.8206283743306995,
|
|
"epoch": 0.847588226229003,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 1.5819567979669634e-06,
|
|
"loss": 1.9631840515136718,
|
|
"mean_token_accuracy": 0.7005566702410578,
|
|
"num_tokens": 7198189.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 1.6183030263334512,
|
|
"epoch": 0.8629989212513485,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 1.4231257941550193e-06,
|
|
"loss": 1.7469532775878907,
|
|
"mean_token_accuracy": 0.7299554903805255,
|
|
"num_tokens": 7331299.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 1.6894471324980258,
|
|
"epoch": 0.878409616273694,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 1.2642947903430749e-06,
|
|
"loss": 1.819949951171875,
|
|
"mean_token_accuracy": 0.7225298710912466,
|
|
"num_tokens": 7468273.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 1.6117998372018336,
|
|
"epoch": 0.8938203112960394,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 1.105463786531131e-06,
|
|
"loss": 1.7713958740234375,
|
|
"mean_token_accuracy": 0.7342760527133941,
|
|
"num_tokens": 7598957.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 1.7378646701574325,
|
|
"epoch": 0.9092310063183849,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 9.466327827191868e-07,
|
|
"loss": 1.8703489685058594,
|
|
"mean_token_accuracy": 0.7118765298649669,
|
|
"num_tokens": 7731099.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 1.6067133033648133,
|
|
"epoch": 0.9246417013407304,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 7.878017789072427e-07,
|
|
"loss": 1.7261012268066407,
|
|
"mean_token_accuracy": 0.7332322986423969,
|
|
"num_tokens": 7858562.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 1.6211172859743237,
|
|
"epoch": 0.940052396363076,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 6.289707750952986e-07,
|
|
"loss": 1.801383514404297,
|
|
"mean_token_accuracy": 0.7289227614179253,
|
|
"num_tokens": 7990016.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 1.6162557833641769,
|
|
"epoch": 0.9554630913854215,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 4.7013977128335456e-07,
|
|
"loss": 1.7632601928710938,
|
|
"mean_token_accuracy": 0.7340098781138659,
|
|
"num_tokens": 8128392.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 1.638201398998499,
|
|
"epoch": 0.970873786407767,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 3.1130876747141044e-07,
|
|
"loss": 1.7828684997558595,
|
|
"mean_token_accuracy": 0.7265522088482976,
|
|
"num_tokens": 8255611.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 1.742434518635273,
|
|
"epoch": 0.9862844814301125,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 1.5247776365946635e-07,
|
|
"loss": 1.8525875854492186,
|
|
"mean_token_accuracy": 0.7128269827365875,
|
|
"num_tokens": 8385670.0,
|
|
"step": 1600
|
|
}
|
|
],
|
|
"logging_steps": 25,
|
|
"max_steps": 1623,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.0231669797680909e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|