{ "best_global_step": 1200, "best_metric": 0.43056953, "best_model_checkpoint": "/openpai_config/sft/Long_Cot_data/Stage1-380k-24k-length-Qwen3-8B-Base-resume-iter4600-4p-3e-5/v0-20250826-235423/checkpoint-1200", "epoch": 2.2665292804396358, "eval_steps": 300, "global_step": 3300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006869311351537008, "grad_norm": 0.07274238765239716, "learning_rate": 2.9999997817680842e-05, "loss": 0.2985985279083252, "memory(GiB)": 56.61, "step": 1, "token_acc": 0.8812937590030485, "train_speed(iter/s)": 0.013692 }, { "epoch": 0.013738622703074016, "grad_norm": 0.10364966094493866, "learning_rate": 2.9999127080781484e-05, "loss": 0.32278080990439967, "memory(GiB)": 70.37, "step": 20, "token_acc": 0.8852460572314409, "train_speed(iter/s)": 0.033578 }, { "epoch": 0.027477245406148033, "grad_norm": 0.0828627347946167, "learning_rate": 2.999650842472434e-05, "loss": 0.3287534713745117, "memory(GiB)": 70.37, "step": 40, "token_acc": 0.8729899276605303, "train_speed(iter/s)": 0.035269 }, { "epoch": 0.04121586810922205, "grad_norm": 0.075782909989357, "learning_rate": 2.9992144336611927e-05, "loss": 0.32648520469665526, "memory(GiB)": 70.37, "step": 60, "token_acc": 0.871392951813677, "train_speed(iter/s)": 0.03605 }, { "epoch": 0.054954490812296065, "grad_norm": 0.09333578497171402, "learning_rate": 2.998603532437709e-05, "loss": 0.3258840799331665, "memory(GiB)": 70.37, "step": 80, "token_acc": 0.8820731351865331, "train_speed(iter/s)": 0.036483 }, { "epoch": 0.06869311351537008, "grad_norm": 0.08618636429309845, "learning_rate": 2.9978182099043062e-05, "loss": 0.3262542724609375, "memory(GiB)": 70.37, "step": 100, "token_acc": 0.8849732393640833, "train_speed(iter/s)": 0.036776 }, { "epoch": 0.0824317362184441, "grad_norm": 0.08525851368904114, "learning_rate": 2.9968585574640675e-05, "loss": 0.32371375560760496, "memory(GiB)": 70.37, "step": 120, "token_acc": 0.8871000703215756, "train_speed(iter/s)": 0.036983 }, { "epoch": 0.09617035892151812, "grad_norm": 0.08820851147174835, "learning_rate": 2.995724686810202e-05, "loss": 0.3224189281463623, "memory(GiB)": 70.37, "step": 140, "token_acc": 0.8856544898235427, "train_speed(iter/s)": 0.03715 }, { "epoch": 0.10990898162459213, "grad_norm": 0.0837075412273407, "learning_rate": 2.9944167299130397e-05, "loss": 0.32613368034362794, "memory(GiB)": 70.4, "step": 160, "token_acc": 0.8848279812123834, "train_speed(iter/s)": 0.037079 }, { "epoch": 0.12364760432766615, "grad_norm": 0.09300094842910767, "learning_rate": 2.9929348390046766e-05, "loss": 0.32998530864715575, "memory(GiB)": 70.4, "step": 180, "token_acc": 0.8633912245766957, "train_speed(iter/s)": 0.037181 }, { "epoch": 0.13738622703074016, "grad_norm": 0.08325231075286865, "learning_rate": 2.9912791865612525e-05, "loss": 0.3349442958831787, "memory(GiB)": 70.4, "step": 200, "token_acc": 0.8835910835510771, "train_speed(iter/s)": 0.037269 }, { "epoch": 0.15112484973381418, "grad_norm": 0.08610066026449203, "learning_rate": 2.9894499652828798e-05, "loss": 0.33656883239746094, "memory(GiB)": 70.4, "step": 220, "token_acc": 0.8754293623532549, "train_speed(iter/s)": 0.037356 }, { "epoch": 0.1648634724368882, "grad_norm": 0.09172473102807999, "learning_rate": 2.9874473880712125e-05, "loss": 0.3357390403747559, "memory(GiB)": 70.4, "step": 240, "token_acc": 0.8816886144038251, "train_speed(iter/s)": 0.037427 }, { "epoch": 0.17860209513996222, "grad_norm": 0.08784262090921402, "learning_rate": 2.9852716880046687e-05, "loss": 0.33715412616729734, "memory(GiB)": 70.4, "step": 260, "token_acc": 0.8768648698038725, "train_speed(iter/s)": 0.037172 }, { "epoch": 0.19234071784303625, "grad_norm": 0.08109364658594131, "learning_rate": 2.9829231183113013e-05, "loss": 0.3330291509628296, "memory(GiB)": 70.4, "step": 280, "token_acc": 0.884993071444176, "train_speed(iter/s)": 0.037251 }, { "epoch": 0.20607934054611024, "grad_norm": 0.08930086344480515, "learning_rate": 2.980401952339328e-05, "loss": 0.32976431846618653, "memory(GiB)": 70.4, "step": 300, "token_acc": 0.8701908549811445, "train_speed(iter/s)": 0.037312 }, { "epoch": 0.20607934054611024, "eval_loss": 0.4335128664970398, "eval_runtime": 97.7946, "eval_samples_per_second": 38.489, "eval_steps_per_second": 0.603, "eval_token_acc": 0.8434016162256411, "step": 300 }, { "epoch": 0.21981796324918426, "grad_norm": 0.08906491845846176, "learning_rate": 2.9777084835253107e-05, "loss": 0.3341225624084473, "memory(GiB)": 72.41, "step": 320, "token_acc": 0.8633943248975853, "train_speed(iter/s)": 0.036714 }, { "epoch": 0.23355658595225828, "grad_norm": 0.09569748491048813, "learning_rate": 2.9748430253600103e-05, "loss": 0.3317814826965332, "memory(GiB)": 72.41, "step": 340, "token_acc": 0.8690857078605088, "train_speed(iter/s)": 0.036742 }, { "epoch": 0.2472952086553323, "grad_norm": 0.09385869652032852, "learning_rate": 2.9718059113518926e-05, "loss": 0.33147258758544923, "memory(GiB)": 72.41, "step": 360, "token_acc": 0.8744091464585358, "train_speed(iter/s)": 0.036789 }, { "epoch": 0.2610338313584063, "grad_norm": 0.09747012704610825, "learning_rate": 2.9685974949883163e-05, "loss": 0.3316455841064453, "memory(GiB)": 72.41, "step": 380, "token_acc": 0.8744771091993391, "train_speed(iter/s)": 0.036834 }, { "epoch": 0.2747724540614803, "grad_norm": 0.0942377969622612, "learning_rate": 2.9652181496943888e-05, "loss": 0.33130803108215334, "memory(GiB)": 72.41, "step": 400, "token_acc": 0.8743932496768547, "train_speed(iter/s)": 0.036889 }, { "epoch": 0.28851107676455434, "grad_norm": 0.08894450962543488, "learning_rate": 2.9616682687895038e-05, "loss": 0.3286163806915283, "memory(GiB)": 72.41, "step": 420, "token_acc": 0.8871290646641795, "train_speed(iter/s)": 0.036939 }, { "epoch": 0.30224969946762836, "grad_norm": 0.09808226674795151, "learning_rate": 2.9579482654415627e-05, "loss": 0.3300768375396729, "memory(GiB)": 72.41, "step": 440, "token_acc": 0.8833857380634584, "train_speed(iter/s)": 0.036991 }, { "epoch": 0.3159883221707024, "grad_norm": 0.09648197889328003, "learning_rate": 2.9540585726188883e-05, "loss": 0.3316764831542969, "memory(GiB)": 72.41, "step": 460, "token_acc": 0.8862556346113715, "train_speed(iter/s)": 0.037032 }, { "epoch": 0.3297269448737764, "grad_norm": 0.08601760119199753, "learning_rate": 2.9499996430398296e-05, "loss": 0.3263013124465942, "memory(GiB)": 72.41, "step": 480, "token_acc": 0.8804854791092939, "train_speed(iter/s)": 0.037074 }, { "epoch": 0.34346556757685043, "grad_norm": 0.08579235523939133, "learning_rate": 2.945771949120071e-05, "loss": 0.33027398586273193, "memory(GiB)": 72.41, "step": 500, "token_acc": 0.8894077316408038, "train_speed(iter/s)": 0.037103 }, { "epoch": 0.35720419027992445, "grad_norm": 0.08523233234882355, "learning_rate": 2.9413759829176497e-05, "loss": 0.32302305698394773, "memory(GiB)": 72.41, "step": 520, "token_acc": 0.8851045135756845, "train_speed(iter/s)": 0.037131 }, { "epoch": 0.37094281298299847, "grad_norm": 0.08190900087356567, "learning_rate": 2.9368122560756822e-05, "loss": 0.3292397975921631, "memory(GiB)": 72.41, "step": 540, "token_acc": 0.8731282020961862, "train_speed(iter/s)": 0.037164 }, { "epoch": 0.3846814356860725, "grad_norm": 0.0842556357383728, "learning_rate": 2.9320812997628184e-05, "loss": 0.3262279748916626, "memory(GiB)": 72.41, "step": 560, "token_acc": 0.8791384953802229, "train_speed(iter/s)": 0.037189 }, { "epoch": 0.3984200583891465, "grad_norm": 0.09352608770132065, "learning_rate": 2.9271836646114166e-05, "loss": 0.3300283908843994, "memory(GiB)": 72.41, "step": 580, "token_acc": 0.8848605133294101, "train_speed(iter/s)": 0.037218 }, { "epoch": 0.4121586810922205, "grad_norm": 0.08545698970556259, "learning_rate": 2.922119920653457e-05, "loss": 0.33294997215270994, "memory(GiB)": 72.41, "step": 600, "token_acc": 0.8852764804825887, "train_speed(iter/s)": 0.037244 }, { "epoch": 0.4121586810922205, "eval_loss": 0.43278947472572327, "eval_runtime": 98.2411, "eval_samples_per_second": 38.314, "eval_steps_per_second": 0.601, "eval_token_acc": 0.8435347324817475, "step": 600 }, { "epoch": 0.4258973037952945, "grad_norm": 0.09264256060123444, "learning_rate": 2.916890657254194e-05, "loss": 0.32485041618347166, "memory(GiB)": 72.41, "step": 620, "token_acc": 0.8645305113582833, "train_speed(iter/s)": 0.036936 }, { "epoch": 0.4396359264983685, "grad_norm": 0.08746380358934402, "learning_rate": 2.9114964830435648e-05, "loss": 0.32976808547973635, "memory(GiB)": 72.41, "step": 640, "token_acc": 0.8707312036033282, "train_speed(iter/s)": 0.036935 }, { "epoch": 0.45337454920144254, "grad_norm": 0.08795125037431717, "learning_rate": 2.9059380258453473e-05, "loss": 0.3318798303604126, "memory(GiB)": 72.41, "step": 660, "token_acc": 0.8750859732168144, "train_speed(iter/s)": 0.03695 }, { "epoch": 0.46711317190451657, "grad_norm": 0.09344589710235596, "learning_rate": 2.9002159326040897e-05, "loss": 0.3324501752853394, "memory(GiB)": 72.41, "step": 680, "token_acc": 0.8820933572199229, "train_speed(iter/s)": 0.036974 }, { "epoch": 0.4808517946075906, "grad_norm": 0.08175963908433914, "learning_rate": 2.894330869309814e-05, "loss": 0.3342601776123047, "memory(GiB)": 72.41, "step": 700, "token_acc": 0.888566898590454, "train_speed(iter/s)": 0.037001 }, { "epoch": 0.4945904173106646, "grad_norm": 0.08012371510267258, "learning_rate": 2.8882835209205e-05, "loss": 0.3325735807418823, "memory(GiB)": 72.41, "step": 720, "token_acc": 0.8726016398435306, "train_speed(iter/s)": 0.03703 }, { "epoch": 0.5083290400137386, "grad_norm": 0.08838852494955063, "learning_rate": 2.8820745912823653e-05, "loss": 0.3366635799407959, "memory(GiB)": 72.41, "step": 740, "token_acc": 0.8669464276591397, "train_speed(iter/s)": 0.037058 }, { "epoch": 0.5220676627168126, "grad_norm": 0.08403000980615616, "learning_rate": 2.8757048030479438e-05, "loss": 0.33883938789367674, "memory(GiB)": 72.41, "step": 760, "token_acc": 0.8731299584725523, "train_speed(iter/s)": 0.037083 }, { "epoch": 0.5358062854198866, "grad_norm": 0.09687156975269318, "learning_rate": 2.8691748975919784e-05, "loss": 0.3338437557220459, "memory(GiB)": 72.41, "step": 780, "token_acc": 0.8848287485135103, "train_speed(iter/s)": 0.037115 }, { "epoch": 0.5495449081229606, "grad_norm": 0.08311958611011505, "learning_rate": 2.862485634925131e-05, "loss": 0.3358239889144897, "memory(GiB)": 72.41, "step": 800, "token_acc": 0.8782527593369446, "train_speed(iter/s)": 0.037145 }, { "epoch": 0.5632835308260347, "grad_norm": 0.08971302956342697, "learning_rate": 2.855637793605527e-05, "loss": 0.3317149877548218, "memory(GiB)": 72.41, "step": 820, "token_acc": 0.8807387938176727, "train_speed(iter/s)": 0.037168 }, { "epoch": 0.5770221535291087, "grad_norm": 0.08680903911590576, "learning_rate": 2.848632170648139e-05, "loss": 0.3325679779052734, "memory(GiB)": 72.41, "step": 840, "token_acc": 0.8763491204567425, "train_speed(iter/s)": 0.037194 }, { "epoch": 0.5907607762321827, "grad_norm": 0.08324452489614487, "learning_rate": 2.8414695814320224e-05, "loss": 0.3364755868911743, "memory(GiB)": 72.41, "step": 860, "token_acc": 0.8762890432412627, "train_speed(iter/s)": 0.037217 }, { "epoch": 0.6044993989352567, "grad_norm": 0.08400746434926987, "learning_rate": 2.834150859605415e-05, "loss": 0.3361694812774658, "memory(GiB)": 72.41, "step": 880, "token_acc": 0.8813631606405008, "train_speed(iter/s)": 0.037241 }, { "epoch": 0.6182380216383307, "grad_norm": 0.08885052800178528, "learning_rate": 2.8266768569887078e-05, "loss": 0.33670692443847655, "memory(GiB)": 72.41, "step": 900, "token_acc": 0.8707416462917685, "train_speed(iter/s)": 0.037262 }, { "epoch": 0.6182380216383307, "eval_loss": 0.43130558729171753, "eval_runtime": 98.6067, "eval_samples_per_second": 38.172, "eval_steps_per_second": 0.598, "eval_token_acc": 0.8438052359165757, "step": 900 }, { "epoch": 0.6319766443414048, "grad_norm": 0.08866509050130844, "learning_rate": 2.8190484434753047e-05, "loss": 0.3301401615142822, "memory(GiB)": 72.41, "step": 920, "token_acc": 0.8694062120826557, "train_speed(iter/s)": 0.037051 }, { "epoch": 0.6457152670444788, "grad_norm": 0.0904315784573555, "learning_rate": 2.811266506930373e-05, "loss": 0.3325372219085693, "memory(GiB)": 72.41, "step": 940, "token_acc": 0.8789977772514003, "train_speed(iter/s)": 0.037059 }, { "epoch": 0.6594538897475528, "grad_norm": 0.08624011278152466, "learning_rate": 2.80333195308751e-05, "loss": 0.33223259449005127, "memory(GiB)": 72.41, "step": 960, "token_acc": 0.8850906146562588, "train_speed(iter/s)": 0.037068 }, { "epoch": 0.6731925124506268, "grad_norm": 0.09679839015007019, "learning_rate": 2.7952457054433193e-05, "loss": 0.3346273183822632, "memory(GiB)": 72.41, "step": 980, "token_acc": 0.8790110005974765, "train_speed(iter/s)": 0.037086 }, { "epoch": 0.6869311351537009, "grad_norm": 0.08788934350013733, "learning_rate": 2.787008705149932e-05, "loss": 0.331668758392334, "memory(GiB)": 72.41, "step": 1000, "token_acc": 0.8727144921802968, "train_speed(iter/s)": 0.037101 }, { "epoch": 0.7006697578567749, "grad_norm": 0.07837922871112823, "learning_rate": 2.7786219109054618e-05, "loss": 0.33264620304107667, "memory(GiB)": 72.41, "step": 1020, "token_acc": 0.8763268597016804, "train_speed(iter/s)": 0.037116 }, { "epoch": 0.7144083805598489, "grad_norm": 0.08805106580257416, "learning_rate": 2.770086298842426e-05, "loss": 0.33027656078338624, "memory(GiB)": 72.41, "step": 1040, "token_acc": 0.8825189704888438, "train_speed(iter/s)": 0.037133 }, { "epoch": 0.7281470032629229, "grad_norm": 0.07906992733478546, "learning_rate": 2.7614028624141333e-05, "loss": 0.3281256914138794, "memory(GiB)": 72.41, "step": 1060, "token_acc": 0.8848472346715264, "train_speed(iter/s)": 0.037145 }, { "epoch": 0.7418856259659969, "grad_norm": 0.08966130018234253, "learning_rate": 2.7525726122790556e-05, "loss": 0.33036127090454104, "memory(GiB)": 72.41, "step": 1080, "token_acc": 0.8711485117219462, "train_speed(iter/s)": 0.037158 }, { "epoch": 0.755624248669071, "grad_norm": 0.0860779657959938, "learning_rate": 2.7435965761831987e-05, "loss": 0.32908782958984373, "memory(GiB)": 72.41, "step": 1100, "token_acc": 0.868969553437273, "train_speed(iter/s)": 0.037175 }, { "epoch": 0.769362871372145, "grad_norm": 0.08935214579105377, "learning_rate": 2.7344757988404845e-05, "loss": 0.33276095390319826, "memory(GiB)": 72.41, "step": 1120, "token_acc": 0.8682377205407336, "train_speed(iter/s)": 0.037189 }, { "epoch": 0.783101494075219, "grad_norm": 0.08778548985719681, "learning_rate": 2.725211341811158e-05, "loss": 0.33044397830963135, "memory(GiB)": 72.41, "step": 1140, "token_acc": 0.8793117868359244, "train_speed(iter/s)": 0.037201 }, { "epoch": 0.796840116778293, "grad_norm": 0.09090530127286911, "learning_rate": 2.71580428337823e-05, "loss": 0.32858192920684814, "memory(GiB)": 72.41, "step": 1160, "token_acc": 0.8733004247503157, "train_speed(iter/s)": 0.037216 }, { "epoch": 0.8105787394813669, "grad_norm": 0.08553975820541382, "learning_rate": 2.7062557184219806e-05, "loss": 0.3291203498840332, "memory(GiB)": 72.41, "step": 1180, "token_acc": 0.8764492301755195, "train_speed(iter/s)": 0.037231 }, { "epoch": 0.824317362184441, "grad_norm": 0.08339999616146088, "learning_rate": 2.6965667582925247e-05, "loss": 0.3333151817321777, "memory(GiB)": 72.41, "step": 1200, "token_acc": 0.8695492444572112, "train_speed(iter/s)": 0.037244 }, { "epoch": 0.824317362184441, "eval_loss": 0.43056952953338623, "eval_runtime": 99.0376, "eval_samples_per_second": 38.006, "eval_steps_per_second": 0.596, "eval_token_acc": 0.8441900740502716, "step": 1200 }, { "epoch": 0.838055984887515, "grad_norm": 0.08720073848962784, "learning_rate": 2.686738530680462e-05, "loss": 0.33159494400024414, "memory(GiB)": 72.41, "step": 1220, "token_acc": 0.8612746275278335, "train_speed(iter/s)": 0.037076 }, { "epoch": 0.851794607590589, "grad_norm": 0.08212270587682724, "learning_rate": 2.676772179485629e-05, "loss": 0.3343451976776123, "memory(GiB)": 72.41, "step": 1240, "token_acc": 0.874983845473253, "train_speed(iter/s)": 0.037077 }, { "epoch": 0.865533230293663, "grad_norm": 0.08813036233186722, "learning_rate": 2.6666688646839574e-05, "loss": 0.3311768531799316, "memory(GiB)": 72.41, "step": 1260, "token_acc": 0.8867381088510741, "train_speed(iter/s)": 0.037081 }, { "epoch": 0.879271852996737, "grad_norm": 0.08220379054546356, "learning_rate": 2.6564297621924696e-05, "loss": 0.33231358528137206, "memory(GiB)": 72.41, "step": 1280, "token_acc": 0.8829307086452494, "train_speed(iter/s)": 0.037092 }, { "epoch": 0.8930104756998111, "grad_norm": 0.08680058270692825, "learning_rate": 2.6460560637324113e-05, "loss": 0.3345161199569702, "memory(GiB)": 72.41, "step": 1300, "token_acc": 0.8796637788480545, "train_speed(iter/s)": 0.037101 }, { "epoch": 0.9067490984028851, "grad_norm": 0.09523913264274597, "learning_rate": 2.6355489766905496e-05, "loss": 0.33291900157928467, "memory(GiB)": 72.41, "step": 1320, "token_acc": 0.8843804465166379, "train_speed(iter/s)": 0.037112 }, { "epoch": 0.9204877211059591, "grad_norm": 0.08730066567659378, "learning_rate": 2.6249097239786456e-05, "loss": 0.33270628452301027, "memory(GiB)": 72.41, "step": 1340, "token_acc": 0.8798414921651669, "train_speed(iter/s)": 0.037122 }, { "epoch": 0.9342263438090331, "grad_norm": 0.0898577868938446, "learning_rate": 2.6141395438911216e-05, "loss": 0.3346142530441284, "memory(GiB)": 72.41, "step": 1360, "token_acc": 0.8822741759710485, "train_speed(iter/s)": 0.037134 }, { "epoch": 0.9479649665121072, "grad_norm": 0.08267758786678314, "learning_rate": 2.603239689960935e-05, "loss": 0.33362205028533937, "memory(GiB)": 72.41, "step": 1380, "token_acc": 0.875556680651538, "train_speed(iter/s)": 0.037147 }, { "epoch": 0.9617035892151812, "grad_norm": 0.08812654763460159, "learning_rate": 2.5922114308136826e-05, "loss": 0.3352126359939575, "memory(GiB)": 72.41, "step": 1400, "token_acc": 0.8624814158268795, "train_speed(iter/s)": 0.037157 }, { "epoch": 0.9754422119182552, "grad_norm": 0.08206140995025635, "learning_rate": 2.5810560500199454e-05, "loss": 0.32973828315734866, "memory(GiB)": 72.41, "step": 1420, "token_acc": 0.8715281106649746, "train_speed(iter/s)": 0.037167 }, { "epoch": 0.9891808346213292, "grad_norm": 0.08127112686634064, "learning_rate": 2.5697748459458945e-05, "loss": 0.33533248901367185, "memory(GiB)": 72.41, "step": 1440, "token_acc": 0.8709140359593514, "train_speed(iter/s)": 0.037179 }, { "epoch": 1.002747724540615, "grad_norm": 0.13326233625411987, "learning_rate": 2.5583691316021758e-05, "loss": 0.32816076278686523, "memory(GiB)": 72.41, "step": 1460, "token_acc": 0.8743706550630328, "train_speed(iter/s)": 0.037189 }, { "epoch": 1.016486347243689, "grad_norm": 0.09180466085672379, "learning_rate": 2.5468402344910895e-05, "loss": 0.30609779357910155, "memory(GiB)": 72.41, "step": 1480, "token_acc": 0.890758443681086, "train_speed(iter/s)": 0.037196 }, { "epoch": 1.030224969946763, "grad_norm": 0.09560491889715195, "learning_rate": 2.5351894964520832e-05, "loss": 0.3120020627975464, "memory(GiB)": 72.41, "step": 1500, "token_acc": 0.8784849657881351, "train_speed(iter/s)": 0.037203 }, { "epoch": 1.030224969946763, "eval_loss": 0.43459564447402954, "eval_runtime": 99.9824, "eval_samples_per_second": 37.647, "eval_steps_per_second": 0.59, "eval_token_acc": 0.8433591170448618, "step": 1500 }, { "epoch": 1.043963592649837, "grad_norm": 0.08771445602178574, "learning_rate": 2.523418273505576e-05, "loss": 0.31691765785217285, "memory(GiB)": 72.41, "step": 1520, "token_acc": 0.8715976442080544, "train_speed(iter/s)": 0.037064 }, { "epoch": 1.057702215352911, "grad_norm": 0.07920734584331512, "learning_rate": 2.511527935695133e-05, "loss": 0.31117587089538573, "memory(GiB)": 72.41, "step": 1540, "token_acc": 0.891813564000213, "train_speed(iter/s)": 0.037065 }, { "epoch": 1.071440838055985, "grad_norm": 0.08091707527637482, "learning_rate": 2.499519866928006e-05, "loss": 0.3078420639038086, "memory(GiB)": 72.41, "step": 1560, "token_acc": 0.8856428771072653, "train_speed(iter/s)": 0.037064 }, { "epoch": 1.0851794607590588, "grad_norm": 0.08118196576833725, "learning_rate": 2.487395464814062e-05, "loss": 0.30695157051086425, "memory(GiB)": 72.41, "step": 1580, "token_acc": 0.896800843691485, "train_speed(iter/s)": 0.037065 }, { "epoch": 1.0989180834621328, "grad_norm": 0.08192740380764008, "learning_rate": 2.475156140503116e-05, "loss": 0.30917532444000245, "memory(GiB)": 72.41, "step": 1600, "token_acc": 0.8825165654283423, "train_speed(iter/s)": 0.037072 }, { "epoch": 1.1126567061652068, "grad_norm": 0.08665605634450912, "learning_rate": 2.4628033185206914e-05, "loss": 0.3106253147125244, "memory(GiB)": 72.41, "step": 1620, "token_acc": 0.885648658540145, "train_speed(iter/s)": 0.037078 }, { "epoch": 1.1263953288682809, "grad_norm": 0.08092272281646729, "learning_rate": 2.4503384366022153e-05, "loss": 0.3136306285858154, "memory(GiB)": 72.41, "step": 1640, "token_acc": 0.8739959227894275, "train_speed(iter/s)": 0.037085 }, { "epoch": 1.1401339515713549, "grad_norm": 0.08443531394004822, "learning_rate": 2.437762945525686e-05, "loss": 0.317700719833374, "memory(GiB)": 72.41, "step": 1660, "token_acc": 0.8844603109257694, "train_speed(iter/s)": 0.037093 }, { "epoch": 1.153872574274429, "grad_norm": 0.0871417298913002, "learning_rate": 2.425078308942815e-05, "loss": 0.3168033123016357, "memory(GiB)": 72.41, "step": 1680, "token_acc": 0.8770761500087126, "train_speed(iter/s)": 0.0371 }, { "epoch": 1.167611196977503, "grad_norm": 0.0831030011177063, "learning_rate": 2.4122860032086763e-05, "loss": 0.31378917694091796, "memory(GiB)": 72.41, "step": 1700, "token_acc": 0.8724602675981399, "train_speed(iter/s)": 0.037109 }, { "epoch": 1.181349819680577, "grad_norm": 0.08853127807378769, "learning_rate": 2.3993875172098737e-05, "loss": 0.31873183250427245, "memory(GiB)": 72.41, "step": 1720, "token_acc": 0.887010406125234, "train_speed(iter/s)": 0.037115 }, { "epoch": 1.195088442383651, "grad_norm": 0.08741605281829834, "learning_rate": 2.3863843521912497e-05, "loss": 0.31804475784301756, "memory(GiB)": 72.41, "step": 1740, "token_acc": 0.8780459946059642, "train_speed(iter/s)": 0.037121 }, { "epoch": 1.208827065086725, "grad_norm": 0.0871034637093544, "learning_rate": 2.3732780215811563e-05, "loss": 0.31754317283630373, "memory(GiB)": 72.41, "step": 1760, "token_acc": 0.8814304365386241, "train_speed(iter/s)": 0.03713 }, { "epoch": 1.222565687789799, "grad_norm": 0.08191618323326111, "learning_rate": 2.3600700508153103e-05, "loss": 0.31642465591430663, "memory(GiB)": 72.41, "step": 1780, "token_acc": 0.8742795205201467, "train_speed(iter/s)": 0.037136 }, { "epoch": 1.236304310492873, "grad_norm": 0.0853760614991188, "learning_rate": 2.346761977159248e-05, "loss": 0.31632657051086427, "memory(GiB)": 72.41, "step": 1800, "token_acc": 0.8847685541277756, "train_speed(iter/s)": 0.037145 }, { "epoch": 1.236304310492873, "eval_loss": 0.4353775978088379, "eval_runtime": 98.8783, "eval_samples_per_second": 38.067, "eval_steps_per_second": 0.597, "eval_token_acc": 0.8432962434627534, "step": 1800 }, { "epoch": 1.250042933195947, "grad_norm": 0.08532160520553589, "learning_rate": 2.3333553495294033e-05, "loss": 0.3085492610931396, "memory(GiB)": 72.41, "step": 1820, "token_acc": 0.8650615076497046, "train_speed(iter/s)": 0.037034 }, { "epoch": 1.263781555899021, "grad_norm": 0.09372863918542862, "learning_rate": 2.3198517283128316e-05, "loss": 0.314247727394104, "memory(GiB)": 72.41, "step": 1840, "token_acc": 0.8870913422078638, "train_speed(iter/s)": 0.037033 }, { "epoch": 1.277520178602095, "grad_norm": 0.0852976143360138, "learning_rate": 2.3062526851855962e-05, "loss": 0.31310009956359863, "memory(GiB)": 72.41, "step": 1860, "token_acc": 0.8906191502723332, "train_speed(iter/s)": 0.037035 }, { "epoch": 1.2912588013051691, "grad_norm": 0.07643554359674454, "learning_rate": 2.2925598029298437e-05, "loss": 0.3103055715560913, "memory(GiB)": 72.41, "step": 1880, "token_acc": 0.887677412229967, "train_speed(iter/s)": 0.037042 }, { "epoch": 1.3049974240082431, "grad_norm": 0.08572836965322495, "learning_rate": 2.278774675249585e-05, "loss": 0.31417174339294435, "memory(GiB)": 72.41, "step": 1900, "token_acc": 0.8894350828946791, "train_speed(iter/s)": 0.037046 }, { "epoch": 1.3187360467113172, "grad_norm": 0.08305912464857101, "learning_rate": 2.264898906585204e-05, "loss": 0.31093263626098633, "memory(GiB)": 72.41, "step": 1920, "token_acc": 0.8820933517164594, "train_speed(iter/s)": 0.037052 }, { "epoch": 1.3324746694143912, "grad_norm": 0.08301204442977905, "learning_rate": 2.2509341119267193e-05, "loss": 0.3095247268676758, "memory(GiB)": 72.41, "step": 1940, "token_acc": 0.8671473791714781, "train_speed(iter/s)": 0.037058 }, { "epoch": 1.3462132921174652, "grad_norm": 0.0815897062420845, "learning_rate": 2.236881916625816e-05, "loss": 0.3098980188369751, "memory(GiB)": 72.41, "step": 1960, "token_acc": 0.8935473891956769, "train_speed(iter/s)": 0.037064 }, { "epoch": 1.3599519148205392, "grad_norm": 0.08821182698011398, "learning_rate": 2.2227439562066734e-05, "loss": 0.30906736850738525, "memory(GiB)": 72.41, "step": 1980, "token_acc": 0.8789100589878682, "train_speed(iter/s)": 0.037069 }, { "epoch": 1.3736905375236133, "grad_norm": 0.0886450707912445, "learning_rate": 2.2085218761756058e-05, "loss": 0.3117701768875122, "memory(GiB)": 72.41, "step": 2000, "token_acc": 0.8915136412607484, "train_speed(iter/s)": 0.037071 }, { "epoch": 1.3874291602266873, "grad_norm": 0.08608590811491013, "learning_rate": 2.1942173318295443e-05, "loss": 0.3138264179229736, "memory(GiB)": 72.41, "step": 2020, "token_acc": 0.8859320703790349, "train_speed(iter/s)": 0.037077 }, { "epoch": 1.4011677829297613, "grad_norm": 0.07869933545589447, "learning_rate": 2.1798319880633795e-05, "loss": 0.3135652542114258, "memory(GiB)": 72.41, "step": 2040, "token_acc": 0.8910194771797223, "train_speed(iter/s)": 0.037082 }, { "epoch": 1.4149064056328353, "grad_norm": 0.07948032766580582, "learning_rate": 2.165367519176183e-05, "loss": 0.3114771842956543, "memory(GiB)": 72.41, "step": 2060, "token_acc": 0.888334672346102, "train_speed(iter/s)": 0.037086 }, { "epoch": 1.4286450283359093, "grad_norm": 0.08114151656627655, "learning_rate": 2.1508256086763372e-05, "loss": 0.3094203948974609, "memory(GiB)": 72.41, "step": 2080, "token_acc": 0.8877183536236418, "train_speed(iter/s)": 0.03709 }, { "epoch": 1.4423836510389834, "grad_norm": 0.0871092826128006, "learning_rate": 2.1362079490855968e-05, "loss": 0.3111464738845825, "memory(GiB)": 72.41, "step": 2100, "token_acc": 0.881144622390869, "train_speed(iter/s)": 0.037093 }, { "epoch": 1.4423836510389834, "eval_loss": 0.4344118535518646, "eval_runtime": 99.5043, "eval_samples_per_second": 37.828, "eval_steps_per_second": 0.593, "eval_token_acc": 0.8434381641208087, "step": 2100 }, { "epoch": 1.4561222737420574, "grad_norm": 0.08234430849552155, "learning_rate": 2.1215162417420926e-05, "loss": 0.3089058637619019, "memory(GiB)": 72.41, "step": 2120, "token_acc": 0.8675401686436827, "train_speed(iter/s)": 0.036998 }, { "epoch": 1.4698608964451314, "grad_norm": 0.08042703568935394, "learning_rate": 2.1067521966023165e-05, "loss": 0.31057741641998293, "memory(GiB)": 72.41, "step": 2140, "token_acc": 0.8962609916378795, "train_speed(iter/s)": 0.036997 }, { "epoch": 1.4835995191482054, "grad_norm": 0.08385493606328964, "learning_rate": 2.0919175320421023e-05, "loss": 0.3134245634078979, "memory(GiB)": 72.41, "step": 2160, "token_acc": 0.8868531518893562, "train_speed(iter/s)": 0.036998 }, { "epoch": 1.4973381418512794, "grad_norm": 0.09416038542985916, "learning_rate": 2.0770139746566223e-05, "loss": 0.31356468200683596, "memory(GiB)": 72.41, "step": 2180, "token_acc": 0.8753653697079176, "train_speed(iter/s)": 0.037001 }, { "epoch": 1.5110767645543535, "grad_norm": 0.08807655423879623, "learning_rate": 2.062043259059432e-05, "loss": 0.31597309112548827, "memory(GiB)": 72.41, "step": 2200, "token_acc": 0.8919064810265528, "train_speed(iter/s)": 0.037007 }, { "epoch": 1.5248153872574275, "grad_norm": 0.08815551549196243, "learning_rate": 2.047007127680579e-05, "loss": 0.3196309804916382, "memory(GiB)": 72.41, "step": 2220, "token_acc": 0.8772186268233043, "train_speed(iter/s)": 0.037012 }, { "epoch": 1.5385540099605015, "grad_norm": 0.08227042853832245, "learning_rate": 2.0319073305638035e-05, "loss": 0.31729488372802733, "memory(GiB)": 72.41, "step": 2240, "token_acc": 0.8858860714860183, "train_speed(iter/s)": 0.037014 }, { "epoch": 1.5522926326635753, "grad_norm": 0.08253244310617447, "learning_rate": 2.0167456251628524e-05, "loss": 0.31553847789764405, "memory(GiB)": 72.41, "step": 2260, "token_acc": 0.8908496364853541, "train_speed(iter/s)": 0.037016 }, { "epoch": 1.5660312553666493, "grad_norm": 0.08127789944410324, "learning_rate": 2.00152377613693e-05, "loss": 0.3174169063568115, "memory(GiB)": 72.41, "step": 2280, "token_acc": 0.8759036896828214, "train_speed(iter/s)": 0.037021 }, { "epoch": 1.5797698780697234, "grad_norm": 0.08351726084947586, "learning_rate": 1.9862435551453103e-05, "loss": 0.31812009811401365, "memory(GiB)": 72.41, "step": 2300, "token_acc": 0.8801280981073656, "train_speed(iter/s)": 0.037028 }, { "epoch": 1.5935085007727974, "grad_norm": 0.08042768388986588, "learning_rate": 1.9709067406411352e-05, "loss": 0.3188045024871826, "memory(GiB)": 72.41, "step": 2320, "token_acc": 0.8883485418399553, "train_speed(iter/s)": 0.037034 }, { "epoch": 1.6072471234758714, "grad_norm": 0.0847587063908577, "learning_rate": 1.9555151176644223e-05, "loss": 0.31552605628967284, "memory(GiB)": 72.41, "step": 2340, "token_acc": 0.8933710959011879, "train_speed(iter/s)": 0.03704 }, { "epoch": 1.6209857461789454, "grad_norm": 0.0849500447511673, "learning_rate": 1.9400704776343047e-05, "loss": 0.3190001010894775, "memory(GiB)": 72.41, "step": 2360, "token_acc": 0.8655127619672538, "train_speed(iter/s)": 0.037046 }, { "epoch": 1.6347243688820194, "grad_norm": 0.0820319652557373, "learning_rate": 1.9245746181405306e-05, "loss": 0.3157363414764404, "memory(GiB)": 72.41, "step": 2380, "token_acc": 0.8931401676158139, "train_speed(iter/s)": 0.037052 }, { "epoch": 1.6484629915850935, "grad_norm": 0.07777854800224304, "learning_rate": 1.9090293427342406e-05, "loss": 0.30912251472473146, "memory(GiB)": 72.41, "step": 2400, "token_acc": 0.8933260366449716, "train_speed(iter/s)": 0.037059 }, { "epoch": 1.6484629915850935, "eval_loss": 0.43367844820022583, "eval_runtime": 99.7879, "eval_samples_per_second": 37.72, "eval_steps_per_second": 0.591, "eval_token_acc": 0.843613657031226, "step": 2400 }, { "epoch": 1.6622016142881675, "grad_norm": 0.08155303448438644, "learning_rate": 1.893436460718056e-05, "loss": 0.3163402795791626, "memory(GiB)": 72.41, "step": 2420, "token_acc": 0.8672117073299662, "train_speed(iter/s)": 0.036975 }, { "epoch": 1.6759402369912415, "grad_norm": 0.08382421731948853, "learning_rate": 1.877797786935495e-05, "loss": 0.3165715217590332, "memory(GiB)": 72.41, "step": 2440, "token_acc": 0.8854491510650321, "train_speed(iter/s)": 0.03697 }, { "epoch": 1.6896788596943155, "grad_norm": 0.08178658783435822, "learning_rate": 1.862115141559744e-05, "loss": 0.3171123504638672, "memory(GiB)": 72.41, "step": 2460, "token_acc": 0.8836779780841286, "train_speed(iter/s)": 0.036973 }, { "epoch": 1.7034174823973895, "grad_norm": 0.07710675150156021, "learning_rate": 1.8463903498818088e-05, "loss": 0.31471326351165774, "memory(GiB)": 72.41, "step": 2480, "token_acc": 0.8908230830682876, "train_speed(iter/s)": 0.036976 }, { "epoch": 1.7171561051004636, "grad_norm": 0.08034602552652359, "learning_rate": 1.8306252420980704e-05, "loss": 0.31853632926940917, "memory(GiB)": 72.41, "step": 2500, "token_acc": 0.8883582169845952, "train_speed(iter/s)": 0.036978 }, { "epoch": 1.7308947278035376, "grad_norm": 0.07948100566864014, "learning_rate": 1.8148216530972714e-05, "loss": 0.3109827995300293, "memory(GiB)": 72.41, "step": 2520, "token_acc": 0.8949681174869483, "train_speed(iter/s)": 0.036981 }, { "epoch": 1.7446333505066116, "grad_norm": 0.07899336516857147, "learning_rate": 1.7989814222469538e-05, "loss": 0.3090771436691284, "memory(GiB)": 72.41, "step": 2540, "token_acc": 0.8906750005261931, "train_speed(iter/s)": 0.036983 }, { "epoch": 1.7583719732096856, "grad_norm": 0.07443471997976303, "learning_rate": 1.783106393179375e-05, "loss": 0.31173481941223147, "memory(GiB)": 72.41, "step": 2560, "token_acc": 0.8891939493597887, "train_speed(iter/s)": 0.036988 }, { "epoch": 1.7721105959127597, "grad_norm": 0.07555528730154037, "learning_rate": 1.767198413576931e-05, "loss": 0.30927410125732424, "memory(GiB)": 72.41, "step": 2580, "token_acc": 0.8850328545945815, "train_speed(iter/s)": 0.036994 }, { "epoch": 1.7858492186158337, "grad_norm": 0.08017897605895996, "learning_rate": 1.7512593349571046e-05, "loss": 0.31209754943847656, "memory(GiB)": 72.41, "step": 2600, "token_acc": 0.8816632260591382, "train_speed(iter/s)": 0.036998 }, { "epoch": 1.7995878413189077, "grad_norm": 0.07677578181028366, "learning_rate": 1.7352910124569695e-05, "loss": 0.30882983207702636, "memory(GiB)": 72.41, "step": 2620, "token_acc": 0.8925267013383078, "train_speed(iter/s)": 0.037 }, { "epoch": 1.8133264640219817, "grad_norm": 0.07692938297986984, "learning_rate": 1.7192953046172726e-05, "loss": 0.3074300289154053, "memory(GiB)": 72.41, "step": 2640, "token_acc": 0.8861350676140611, "train_speed(iter/s)": 0.037005 }, { "epoch": 1.8270650867250557, "grad_norm": 0.07619909197092056, "learning_rate": 1.7032740731661178e-05, "loss": 0.30927472114562987, "memory(GiB)": 72.41, "step": 2660, "token_acc": 0.8921508449028042, "train_speed(iter/s)": 0.037009 }, { "epoch": 1.8408037094281298, "grad_norm": 0.08186180889606476, "learning_rate": 1.687229182802284e-05, "loss": 0.3076324939727783, "memory(GiB)": 72.41, "step": 2680, "token_acc": 0.874112111934862, "train_speed(iter/s)": 0.037013 }, { "epoch": 1.8545423321312038, "grad_norm": 0.0749615728855133, "learning_rate": 1.6711625009781926e-05, "loss": 0.3025542736053467, "memory(GiB)": 72.41, "step": 2700, "token_acc": 0.9005740784776456, "train_speed(iter/s)": 0.037016 }, { "epoch": 1.8545423321312038, "eval_loss": 0.43260329961776733, "eval_runtime": 99.7086, "eval_samples_per_second": 37.75, "eval_steps_per_second": 0.592, "eval_token_acc": 0.8437799954640701, "step": 2700 }, { "epoch": 1.8682809548342778, "grad_norm": 0.07678617537021637, "learning_rate": 1.655075897682555e-05, "loss": 0.3069960117340088, "memory(GiB)": 72.41, "step": 2720, "token_acc": 0.8656336346071796, "train_speed(iter/s)": 0.036945 }, { "epoch": 1.8820195775373518, "grad_norm": 0.08224895596504211, "learning_rate": 1.6389712452227295e-05, "loss": 0.31150364875793457, "memory(GiB)": 72.41, "step": 2740, "token_acc": 0.8871026948734946, "train_speed(iter/s)": 0.036944 }, { "epoch": 1.8957582002404259, "grad_norm": 0.07674538344144821, "learning_rate": 1.6228504180068003e-05, "loss": 0.31361680030822753, "memory(GiB)": 72.41, "step": 2760, "token_acc": 0.8885069679173144, "train_speed(iter/s)": 0.036944 }, { "epoch": 1.9094968229434999, "grad_norm": 0.07724355906248093, "learning_rate": 1.60671529232542e-05, "loss": 0.31092076301574706, "memory(GiB)": 72.41, "step": 2780, "token_acc": 0.8775532573683428, "train_speed(iter/s)": 0.036948 }, { "epoch": 1.923235445646574, "grad_norm": 0.0748470202088356, "learning_rate": 1.5905677461334292e-05, "loss": 0.3125690698623657, "memory(GiB)": 72.41, "step": 2800, "token_acc": 0.8846377126342211, "train_speed(iter/s)": 0.036916 }, { "epoch": 1.936974068349648, "grad_norm": 0.08404634892940521, "learning_rate": 1.574409658831281e-05, "loss": 0.3153404235839844, "memory(GiB)": 72.41, "step": 2820, "token_acc": 0.8762131944710342, "train_speed(iter/s)": 0.036918 }, { "epoch": 1.950712691052722, "grad_norm": 0.07959295809268951, "learning_rate": 1.558242911046302e-05, "loss": 0.31249830722808836, "memory(GiB)": 72.41, "step": 2840, "token_acc": 0.8892035392544179, "train_speed(iter/s)": 0.036921 }, { "epoch": 1.964451313755796, "grad_norm": 0.08044803887605667, "learning_rate": 1.5420693844138036e-05, "loss": 0.3130341053009033, "memory(GiB)": 72.41, "step": 2860, "token_acc": 0.8932265094341124, "train_speed(iter/s)": 0.036924 }, { "epoch": 1.97818993645887, "grad_norm": 0.07583785802125931, "learning_rate": 1.525890961358083e-05, "loss": 0.3141756772994995, "memory(GiB)": 72.41, "step": 2880, "token_acc": 0.8839660044002189, "train_speed(iter/s)": 0.036928 }, { "epoch": 1.991928559161944, "grad_norm": 0.07464556396007538, "learning_rate": 1.5097095248733284e-05, "loss": 0.31082568168640134, "memory(GiB)": 72.41, "step": 2900, "token_acc": 0.8775183645838733, "train_speed(iter/s)": 0.036932 }, { "epoch": 2.00549544908123, "grad_norm": 0.12503303587436676, "learning_rate": 1.4935269583044581e-05, "loss": 0.2993995904922485, "memory(GiB)": 72.41, "step": 2920, "token_acc": 0.8823204490957476, "train_speed(iter/s)": 0.036937 }, { "epoch": 2.019234071784304, "grad_norm": 0.08332613110542297, "learning_rate": 1.4773451451279213e-05, "loss": 0.29198360443115234, "memory(GiB)": 72.41, "step": 2940, "token_acc": 0.8980348203187491, "train_speed(iter/s)": 0.036941 }, { "epoch": 2.032972694487378, "grad_norm": 0.08240070939064026, "learning_rate": 1.461165968732479e-05, "loss": 0.2935274362564087, "memory(GiB)": 72.41, "step": 2960, "token_acc": 0.8904662128095143, "train_speed(iter/s)": 0.036946 }, { "epoch": 2.046711317190452, "grad_norm": 0.08113058656454086, "learning_rate": 1.4449913122000005e-05, "loss": 0.29198508262634276, "memory(GiB)": 72.41, "step": 2980, "token_acc": 0.8908448858293387, "train_speed(iter/s)": 0.036953 }, { "epoch": 2.060449939893526, "grad_norm": 0.08105536550283432, "learning_rate": 1.4288230580862905e-05, "loss": 0.290987491607666, "memory(GiB)": 72.41, "step": 3000, "token_acc": 0.8764201959142056, "train_speed(iter/s)": 0.036958 }, { "epoch": 2.060449939893526, "eval_loss": 0.4393101930618286, "eval_runtime": 100.5668, "eval_samples_per_second": 37.428, "eval_steps_per_second": 0.587, "eval_token_acc": 0.84265782805066, "step": 3000 }, { "epoch": 2.0741885625966, "grad_norm": 0.08123844116926193, "learning_rate": 1.412663088201982e-05, "loss": 0.29090156555175783, "memory(GiB)": 72.41, "step": 3020, "token_acc": 0.8735638063478551, "train_speed(iter/s)": 0.036892 }, { "epoch": 2.087927185299674, "grad_norm": 0.0814339891076088, "learning_rate": 1.3965132833935126e-05, "loss": 0.2902204990386963, "memory(GiB)": 72.41, "step": 3040, "token_acc": 0.8839492383548759, "train_speed(iter/s)": 0.036893 }, { "epoch": 2.101665808002748, "grad_norm": 0.0760771632194519, "learning_rate": 1.380375523324215e-05, "loss": 0.29666552543640134, "memory(GiB)": 72.41, "step": 3060, "token_acc": 0.8921821581883145, "train_speed(iter/s)": 0.036894 }, { "epoch": 2.115404430705822, "grad_norm": 0.0808984562754631, "learning_rate": 1.3642516862555433e-05, "loss": 0.28961887359619143, "memory(GiB)": 72.41, "step": 3080, "token_acc": 0.8996385382943057, "train_speed(iter/s)": 0.036898 }, { "epoch": 2.129143053408896, "grad_norm": 0.08168598264455795, "learning_rate": 1.3481436488284648e-05, "loss": 0.2952747821807861, "memory(GiB)": 72.41, "step": 3100, "token_acc": 0.8932001882680204, "train_speed(iter/s)": 0.036901 }, { "epoch": 2.14288167611197, "grad_norm": 0.08384265005588531, "learning_rate": 1.3320532858450382e-05, "loss": 0.29767014980316164, "memory(GiB)": 72.41, "step": 3120, "token_acc": 0.8847414688023099, "train_speed(iter/s)": 0.036904 }, { "epoch": 2.156620298815044, "grad_norm": 0.08613137155771255, "learning_rate": 1.3159824700502083e-05, "loss": 0.2987870693206787, "memory(GiB)": 72.41, "step": 3140, "token_acc": 0.8912630847005318, "train_speed(iter/s)": 0.036909 }, { "epoch": 2.1703589215181176, "grad_norm": 0.08044470101594925, "learning_rate": 1.2999330719138363e-05, "loss": 0.29793477058410645, "memory(GiB)": 72.41, "step": 3160, "token_acc": 0.8911335210006078, "train_speed(iter/s)": 0.036914 }, { "epoch": 2.1840975442211916, "grad_norm": 0.0807594358921051, "learning_rate": 1.283906959413e-05, "loss": 0.2947986125946045, "memory(GiB)": 72.41, "step": 3180, "token_acc": 0.8855782459322568, "train_speed(iter/s)": 0.036917 }, { "epoch": 2.1978361669242656, "grad_norm": 0.08066173642873764, "learning_rate": 1.267905997814578e-05, "loss": 0.2977961778640747, "memory(GiB)": 72.41, "step": 3200, "token_acc": 0.8803478438446615, "train_speed(iter/s)": 0.036922 }, { "epoch": 2.2115747896273397, "grad_norm": 0.08045271784067154, "learning_rate": 1.2519320494581581e-05, "loss": 0.29424285888671875, "memory(GiB)": 72.41, "step": 3220, "token_acc": 0.8828892872837293, "train_speed(iter/s)": 0.036927 }, { "epoch": 2.2253134123304137, "grad_norm": 0.08237405866384506, "learning_rate": 1.2359869735392746e-05, "loss": 0.29676170349121095, "memory(GiB)": 72.41, "step": 3240, "token_acc": 0.8954581030873394, "train_speed(iter/s)": 0.036931 }, { "epoch": 2.2390520350334877, "grad_norm": 0.08597618341445923, "learning_rate": 1.220072625893023e-05, "loss": 0.296732759475708, "memory(GiB)": 72.41, "step": 3260, "token_acc": 0.8882645330425789, "train_speed(iter/s)": 0.036937 }, { "epoch": 2.2527906577365617, "grad_norm": 0.07742371410131454, "learning_rate": 1.2041908587780571e-05, "loss": 0.293271803855896, "memory(GiB)": 72.41, "step": 3280, "token_acc": 0.8859188183637006, "train_speed(iter/s)": 0.036942 }, { "epoch": 2.2665292804396358, "grad_norm": 0.07885393500328064, "learning_rate": 1.1883435206610095e-05, "loss": 0.29781594276428225, "memory(GiB)": 72.41, "step": 3300, "token_acc": 0.8873998820001778, "train_speed(iter/s)": 0.036945 }, { "epoch": 2.2665292804396358, "eval_loss": 0.4392697215080261, "eval_runtime": 99.0852, "eval_samples_per_second": 37.988, "eval_steps_per_second": 0.595, "eval_token_acc": 0.8427081339178593, "step": 3300 } ], "logging_steps": 20, "max_steps": 5824, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.596186893058048e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }