{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987419245154709, "eval_steps": 500, "global_step": 273, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01088065283917035, "grad_norm": 5.769476412265319, "learning_rate": 2.8571428571428573e-06, "loss": 0.8514, "step": 1 }, { "epoch": 0.0217613056783407, "grad_norm": 5.824165855722914, "learning_rate": 5.7142857142857145e-06, "loss": 0.8564, "step": 2 }, { "epoch": 0.032641958517511054, "grad_norm": 5.385924326553742, "learning_rate": 8.571428571428571e-06, "loss": 0.8354, "step": 3 }, { "epoch": 0.0435226113566814, "grad_norm": 2.307280123324516, "learning_rate": 1.1428571428571429e-05, "loss": 0.771, "step": 4 }, { "epoch": 0.05440326419585175, "grad_norm": 3.8643476531037586, "learning_rate": 1.4285714285714287e-05, "loss": 0.7531, "step": 5 }, { "epoch": 0.06528391703502211, "grad_norm": 4.185929449072588, "learning_rate": 1.7142857142857142e-05, "loss": 0.7605, "step": 6 }, { "epoch": 0.07616456987419246, "grad_norm": 4.238592120499111, "learning_rate": 2e-05, "loss": 0.7288, "step": 7 }, { "epoch": 0.0870452227133628, "grad_norm": 2.702549334633677, "learning_rate": 2.2857142857142858e-05, "loss": 0.7069, "step": 8 }, { "epoch": 0.09792587555253315, "grad_norm": 2.8265702023660415, "learning_rate": 2.5714285714285718e-05, "loss": 0.6744, "step": 9 }, { "epoch": 0.1088065283917035, "grad_norm": 2.1843367934470335, "learning_rate": 2.8571428571428574e-05, "loss": 0.6654, "step": 10 }, { "epoch": 0.11968718123087385, "grad_norm": 1.5063626985021334, "learning_rate": 3.142857142857143e-05, "loss": 0.6374, "step": 11 }, { "epoch": 0.13056783407004421, "grad_norm": 1.3685321573654194, "learning_rate": 3.4285714285714284e-05, "loss": 0.6326, "step": 12 }, { "epoch": 0.14144848690921455, "grad_norm": 1.4488139563790012, "learning_rate": 3.714285714285715e-05, "loss": 0.6164, "step": 13 }, { "epoch": 0.1523291397483849, "grad_norm": 1.1107459336385455, "learning_rate": 4e-05, "loss": 0.6096, "step": 14 }, { "epoch": 0.16320979258755525, "grad_norm": 1.4034801732959803, "learning_rate": 4.2857142857142856e-05, "loss": 0.5988, "step": 15 }, { "epoch": 0.1740904454267256, "grad_norm": 1.1173039797504758, "learning_rate": 4.5714285714285716e-05, "loss": 0.5979, "step": 16 }, { "epoch": 0.18497109826589594, "grad_norm": 1.3758839510979688, "learning_rate": 4.857142857142857e-05, "loss": 0.5824, "step": 17 }, { "epoch": 0.1958517511050663, "grad_norm": 0.9901489591062091, "learning_rate": 5.1428571428571436e-05, "loss": 0.5851, "step": 18 }, { "epoch": 0.20673240394423664, "grad_norm": 1.3298335358386597, "learning_rate": 5.4285714285714295e-05, "loss": 0.5798, "step": 19 }, { "epoch": 0.217613056783407, "grad_norm": 1.1614826605670152, "learning_rate": 5.714285714285715e-05, "loss": 0.5623, "step": 20 }, { "epoch": 0.22849370962257737, "grad_norm": 1.3114977506132772, "learning_rate": 6.000000000000001e-05, "loss": 0.5617, "step": 21 }, { "epoch": 0.2393743624617477, "grad_norm": 0.9156072730087671, "learning_rate": 6.285714285714286e-05, "loss": 0.554, "step": 22 }, { "epoch": 0.25025501530091804, "grad_norm": 1.5985352795964587, "learning_rate": 6.571428571428571e-05, "loss": 0.5728, "step": 23 }, { "epoch": 0.26113566814008843, "grad_norm": 1.347383875892142, "learning_rate": 6.857142857142857e-05, "loss": 0.5663, "step": 24 }, { "epoch": 0.27201632097925876, "grad_norm": 0.8364475716969787, "learning_rate": 7.142857142857143e-05, "loss": 0.5589, "step": 25 }, { "epoch": 0.2828969738184291, "grad_norm": 1.6877024422160516, "learning_rate": 7.42857142857143e-05, "loss": 0.5561, "step": 26 }, { "epoch": 0.29377762665759943, "grad_norm": 0.8544602518620728, "learning_rate": 7.714285714285715e-05, "loss": 0.5371, "step": 27 }, { "epoch": 0.3046582794967698, "grad_norm": 1.10746206935203, "learning_rate": 8e-05, "loss": 0.5482, "step": 28 }, { "epoch": 0.31553893233594016, "grad_norm": 0.8901965542271334, "learning_rate": 7.999671154713278e-05, "loss": 0.5468, "step": 29 }, { "epoch": 0.3264195851751105, "grad_norm": 1.3171275211566689, "learning_rate": 7.99868467292272e-05, "loss": 0.5484, "step": 30 }, { "epoch": 0.3373002380142809, "grad_norm": 1.119133823137938, "learning_rate": 7.997040716828271e-05, "loss": 0.5391, "step": 31 }, { "epoch": 0.3481808908534512, "grad_norm": 766.2890164269288, "learning_rate": 7.994739556733538e-05, "loss": 7.0977, "step": 32 }, { "epoch": 0.35906154369262155, "grad_norm": 360.8841002867855, "learning_rate": 7.991781571001347e-05, "loss": 8.9951, "step": 33 }, { "epoch": 0.3699421965317919, "grad_norm": 378.0503338631233, "learning_rate": 7.988167245991528e-05, "loss": 6.6897, "step": 34 }, { "epoch": 0.3808228493709623, "grad_norm": 7.672002821902226, "learning_rate": 7.983897175980957e-05, "loss": 0.7981, "step": 35 }, { "epoch": 0.3917035022101326, "grad_norm": 4.42825926220236, "learning_rate": 7.97897206306583e-05, "loss": 0.7036, "step": 36 }, { "epoch": 0.40258415504930295, "grad_norm": 13.3824847890278, "learning_rate": 7.973392717046233e-05, "loss": 0.6454, "step": 37 }, { "epoch": 0.4134648078884733, "grad_norm": 4.184331419144958, "learning_rate": 7.967160055292984e-05, "loss": 0.6386, "step": 38 }, { "epoch": 0.4243454607276437, "grad_norm": 2.5999303493722135, "learning_rate": 7.960275102596809e-05, "loss": 0.6128, "step": 39 }, { "epoch": 0.435226113566814, "grad_norm": 5.547292352391667, "learning_rate": 7.952738990999824e-05, "loss": 0.5773, "step": 40 }, { "epoch": 0.44610676640598435, "grad_norm": 2.1944412304611167, "learning_rate": 7.94455295960942e-05, "loss": 0.569, "step": 41 }, { "epoch": 0.45698741924515474, "grad_norm": 1.3921897468618165, "learning_rate": 7.93571835439452e-05, "loss": 0.5501, "step": 42 }, { "epoch": 0.46786807208432507, "grad_norm": 1.3666229580107998, "learning_rate": 7.926236627964262e-05, "loss": 0.5486, "step": 43 }, { "epoch": 0.4787487249234954, "grad_norm": 1.2707740638081064, "learning_rate": 7.916109339329173e-05, "loss": 0.5452, "step": 44 }, { "epoch": 0.48962937776266574, "grad_norm": 0.7728181787873867, "learning_rate": 7.905338153644818e-05, "loss": 0.5349, "step": 45 }, { "epoch": 0.5005100306018361, "grad_norm": 1.081424627354813, "learning_rate": 7.89392484193802e-05, "loss": 0.5359, "step": 46 }, { "epoch": 0.5113906834410065, "grad_norm": 0.8451219565877732, "learning_rate": 7.881871280815659e-05, "loss": 0.5192, "step": 47 }, { "epoch": 0.5222713362801769, "grad_norm": 0.9838361376918019, "learning_rate": 7.869179452156118e-05, "loss": 0.523, "step": 48 }, { "epoch": 0.5331519891193471, "grad_norm": 0.7178144668508193, "learning_rate": 7.855851442783414e-05, "loss": 0.5277, "step": 49 }, { "epoch": 0.5440326419585175, "grad_norm": 0.8653884854595517, "learning_rate": 7.841889444124078e-05, "loss": 0.5171, "step": 50 }, { "epoch": 0.5549132947976878, "grad_norm": 0.9615871234590915, "learning_rate": 7.827295751846836e-05, "loss": 0.5228, "step": 51 }, { "epoch": 0.5657939476368582, "grad_norm": 0.6109242817024244, "learning_rate": 7.81207276548515e-05, "loss": 0.5176, "step": 52 }, { "epoch": 0.5766746004760286, "grad_norm": 0.7262379927240948, "learning_rate": 7.796222988042676e-05, "loss": 0.5173, "step": 53 }, { "epoch": 0.5875552533151989, "grad_norm": 0.8000849152573422, "learning_rate": 7.779749025581717e-05, "loss": 0.5112, "step": 54 }, { "epoch": 0.5984359061543693, "grad_norm": 0.6837286870058386, "learning_rate": 7.762653586794731e-05, "loss": 0.5207, "step": 55 }, { "epoch": 0.6093165589935396, "grad_norm": 0.502300421602433, "learning_rate": 7.74493948255895e-05, "loss": 0.5023, "step": 56 }, { "epoch": 0.6201972118327099, "grad_norm": 0.5742765191011999, "learning_rate": 7.726609625474218e-05, "loss": 0.493, "step": 57 }, { "epoch": 0.6310778646718803, "grad_norm": 0.7752392361844105, "learning_rate": 7.707667029384088e-05, "loss": 0.5002, "step": 58 }, { "epoch": 0.6419585175110507, "grad_norm": 0.8003464724621063, "learning_rate": 7.688114808880283e-05, "loss": 0.5014, "step": 59 }, { "epoch": 0.652839170350221, "grad_norm": 0.4467202145391146, "learning_rate": 7.667956178790582e-05, "loss": 0.4932, "step": 60 }, { "epoch": 0.6637198231893914, "grad_norm": 0.47406542774154264, "learning_rate": 7.647194453650228e-05, "loss": 0.5052, "step": 61 }, { "epoch": 0.6746004760285618, "grad_norm": 0.61628828690436, "learning_rate": 7.625833047156953e-05, "loss": 0.48, "step": 62 }, { "epoch": 0.685481128867732, "grad_norm": 0.6613121807693437, "learning_rate": 7.603875471609677e-05, "loss": 0.496, "step": 63 }, { "epoch": 0.6963617817069024, "grad_norm": 0.5547782762077185, "learning_rate": 7.581325337331013e-05, "loss": 0.4961, "step": 64 }, { "epoch": 0.7072424345460727, "grad_norm": 0.4876382130153157, "learning_rate": 7.558186352073648e-05, "loss": 0.4835, "step": 65 }, { "epoch": 0.7181230873852431, "grad_norm": 0.5304127406212, "learning_rate": 7.534462320410702e-05, "loss": 0.4935, "step": 66 }, { "epoch": 0.7290037402244135, "grad_norm": 0.41710180085319953, "learning_rate": 7.510157143110172e-05, "loss": 0.4906, "step": 67 }, { "epoch": 0.7398843930635838, "grad_norm": 0.48357676956883017, "learning_rate": 7.485274816493558e-05, "loss": 0.4924, "step": 68 }, { "epoch": 0.7507650459027542, "grad_norm": 0.5917098078277447, "learning_rate": 7.459819431778775e-05, "loss": 0.4947, "step": 69 }, { "epoch": 0.7616456987419246, "grad_norm": 0.7595830676091241, "learning_rate": 7.433795174407465e-05, "loss": 0.4817, "step": 70 }, { "epoch": 0.7725263515810948, "grad_norm": 0.7936800209738104, "learning_rate": 7.407206323356818e-05, "loss": 0.4918, "step": 71 }, { "epoch": 0.7834070044202652, "grad_norm": 0.7864537130215999, "learning_rate": 7.380057250436006e-05, "loss": 0.4887, "step": 72 }, { "epoch": 0.7942876572594356, "grad_norm": 0.49916613247579816, "learning_rate": 7.352352419567362e-05, "loss": 0.4816, "step": 73 }, { "epoch": 0.8051683100986059, "grad_norm": 0.41101864085328815, "learning_rate": 7.324096386052416e-05, "loss": 0.485, "step": 74 }, { "epoch": 0.8160489629377763, "grad_norm": 0.656161004995155, "learning_rate": 7.295293795822887e-05, "loss": 0.4744, "step": 75 }, { "epoch": 0.8269296157769466, "grad_norm": 0.5394326858320477, "learning_rate": 7.265949384676795e-05, "loss": 0.4722, "step": 76 }, { "epoch": 0.837810268616117, "grad_norm": 0.2642185151465239, "learning_rate": 7.236067977499791e-05, "loss": 0.4807, "step": 77 }, { "epoch": 0.8486909214552874, "grad_norm": 0.5422041951823006, "learning_rate": 7.205654487471826e-05, "loss": 0.4797, "step": 78 }, { "epoch": 0.8595715742944576, "grad_norm": 0.5501247208421062, "learning_rate": 7.174713915259331e-05, "loss": 0.475, "step": 79 }, { "epoch": 0.870452227133628, "grad_norm": 0.4027113374681404, "learning_rate": 7.143251348192971e-05, "loss": 0.4677, "step": 80 }, { "epoch": 0.8813328799727984, "grad_norm": 0.46847225987486674, "learning_rate": 7.111271959431189e-05, "loss": 0.4793, "step": 81 }, { "epoch": 0.8922135328119687, "grad_norm": 0.37608598585235453, "learning_rate": 7.078781007109625e-05, "loss": 0.4695, "step": 82 }, { "epoch": 0.9030941856511391, "grad_norm": 0.3314250240053766, "learning_rate": 7.045783833476538e-05, "loss": 0.4794, "step": 83 }, { "epoch": 0.9139748384903095, "grad_norm": 0.35538493046088, "learning_rate": 7.012285864014445e-05, "loss": 0.4758, "step": 84 }, { "epoch": 0.9248554913294798, "grad_norm": 0.40899275410289987, "learning_rate": 6.978292606548029e-05, "loss": 0.4716, "step": 85 }, { "epoch": 0.9357361441686501, "grad_norm": 0.39602646601922714, "learning_rate": 6.943809650338541e-05, "loss": 0.4703, "step": 86 }, { "epoch": 0.9466167970078204, "grad_norm": 0.32981646879813115, "learning_rate": 6.908842665164789e-05, "loss": 0.4665, "step": 87 }, { "epoch": 0.9574974498469908, "grad_norm": 0.45700638514003344, "learning_rate": 6.873397400390911e-05, "loss": 0.4718, "step": 88 }, { "epoch": 0.9683781026861612, "grad_norm": 0.4952520819569817, "learning_rate": 6.837479684021032e-05, "loss": 0.4726, "step": 89 }, { "epoch": 0.9792587555253315, "grad_norm": 0.480434270208596, "learning_rate": 6.80109542174102e-05, "loss": 0.4699, "step": 90 }, { "epoch": 0.9901394083645019, "grad_norm": 0.593562930442879, "learning_rate": 6.76425059594746e-05, "loss": 0.4788, "step": 91 }, { "epoch": 1.009520571234274, "grad_norm": 0.6664418776633385, "learning_rate": 6.726951264763998e-05, "loss": 0.456, "step": 92 }, { "epoch": 1.0204012240734444, "grad_norm": 0.6935514470470976, "learning_rate": 6.689203561045268e-05, "loss": 0.4545, "step": 93 }, { "epoch": 1.0312818769126149, "grad_norm": 0.8262173326620568, "learning_rate": 6.651013691368492e-05, "loss": 0.4589, "step": 94 }, { "epoch": 1.0421625297517851, "grad_norm": 0.9951587995620871, "learning_rate": 6.612387935012995e-05, "loss": 0.4594, "step": 95 }, { "epoch": 1.0530431825909554, "grad_norm": 0.9844429089044019, "learning_rate": 6.573332642927737e-05, "loss": 0.4514, "step": 96 }, { "epoch": 1.063923835430126, "grad_norm": 0.6991926155223892, "learning_rate": 6.53385423668708e-05, "loss": 0.4416, "step": 97 }, { "epoch": 1.0748044882692962, "grad_norm": 0.585309533517111, "learning_rate": 6.493959207434934e-05, "loss": 0.4497, "step": 98 }, { "epoch": 1.0856851411084665, "grad_norm": 0.7423320704031524, "learning_rate": 6.453654114817467e-05, "loss": 0.457, "step": 99 }, { "epoch": 1.0965657939476368, "grad_norm": 0.6185657582865267, "learning_rate": 6.412945585904545e-05, "loss": 0.4481, "step": 100 }, { "epoch": 1.1074464467868073, "grad_norm": 0.45496645094782234, "learning_rate": 6.371840314100104e-05, "loss": 0.4514, "step": 101 }, { "epoch": 1.1183270996259775, "grad_norm": 0.6039336404495378, "learning_rate": 6.330345058041585e-05, "loss": 0.4583, "step": 102 }, { "epoch": 1.1292077524651478, "grad_norm": 0.4358480651096581, "learning_rate": 6.288466640488679e-05, "loss": 0.4374, "step": 103 }, { "epoch": 1.1400884053043183, "grad_norm": 0.43724639626696055, "learning_rate": 6.2462119472015e-05, "loss": 0.4428, "step": 104 }, { "epoch": 1.1509690581434886, "grad_norm": 0.5729837222594828, "learning_rate": 6.20358792580841e-05, "loss": 0.4471, "step": 105 }, { "epoch": 1.1618497109826589, "grad_norm": 0.39024335737726285, "learning_rate": 6.160601584663681e-05, "loss": 0.4453, "step": 106 }, { "epoch": 1.1727303638218294, "grad_norm": 0.3646061376356188, "learning_rate": 6.11725999169515e-05, "loss": 0.4491, "step": 107 }, { "epoch": 1.1836110166609997, "grad_norm": 0.46982607381941577, "learning_rate": 6.0735702732421015e-05, "loss": 0.4443, "step": 108 }, { "epoch": 1.19449166950017, "grad_norm": 0.29189646752552, "learning_rate": 6.029539612883529e-05, "loss": 0.4402, "step": 109 }, { "epoch": 1.2053723223393404, "grad_norm": 0.3520773279039917, "learning_rate": 5.9851752502570015e-05, "loss": 0.435, "step": 110 }, { "epoch": 1.2162529751785107, "grad_norm": 0.32075025375118393, "learning_rate": 5.940484479868288e-05, "loss": 0.4462, "step": 111 }, { "epoch": 1.227133628017681, "grad_norm": 0.2882478429202779, "learning_rate": 5.895474649891995e-05, "loss": 0.4421, "step": 112 }, { "epoch": 1.2380142808568515, "grad_norm": 0.31925565019590013, "learning_rate": 5.8501531609633424e-05, "loss": 0.446, "step": 113 }, { "epoch": 1.2488949336960218, "grad_norm": 0.21223179457423266, "learning_rate": 5.8045274649613386e-05, "loss": 0.4448, "step": 114 }, { "epoch": 1.259775586535192, "grad_norm": 0.26953627133631786, "learning_rate": 5.7586050637835295e-05, "loss": 0.4374, "step": 115 }, { "epoch": 1.2706562393743623, "grad_norm": 0.21178369672133335, "learning_rate": 5.7123935081125034e-05, "loss": 0.4458, "step": 116 }, { "epoch": 1.2815368922135328, "grad_norm": 0.21155767077821688, "learning_rate": 5.6659003961743965e-05, "loss": 0.4376, "step": 117 }, { "epoch": 1.2924175450527031, "grad_norm": 0.21347423268768667, "learning_rate": 5.619133372489575e-05, "loss": 0.4515, "step": 118 }, { "epoch": 1.3032981978918734, "grad_norm": 0.21576745513388285, "learning_rate": 5.572100126615695e-05, "loss": 0.4443, "step": 119 }, { "epoch": 1.314178850731044, "grad_norm": 0.24546247378355868, "learning_rate": 5.524808391883367e-05, "loss": 0.441, "step": 120 }, { "epoch": 1.3250595035702142, "grad_norm": 0.2207361069513953, "learning_rate": 5.477265944124626e-05, "loss": 0.4354, "step": 121 }, { "epoch": 1.3359401564093845, "grad_norm": 0.2444333045464523, "learning_rate": 5.429480600394405e-05, "loss": 0.4407, "step": 122 }, { "epoch": 1.346820809248555, "grad_norm": 0.24749025049340784, "learning_rate": 5.381460217685231e-05, "loss": 0.4359, "step": 123 }, { "epoch": 1.3577014620877252, "grad_norm": 0.22381053800753611, "learning_rate": 5.333212691635368e-05, "loss": 0.4347, "step": 124 }, { "epoch": 1.3685821149268955, "grad_norm": 0.24301738733290523, "learning_rate": 5.2847459552305834e-05, "loss": 0.4337, "step": 125 }, { "epoch": 1.379462767766066, "grad_norm": 0.24671775018284153, "learning_rate": 5.23606797749979e-05, "loss": 0.4384, "step": 126 }, { "epoch": 1.3903434206052363, "grad_norm": 0.23843656791767695, "learning_rate": 5.1871867622047624e-05, "loss": 0.4444, "step": 127 }, { "epoch": 1.4012240734444066, "grad_norm": 0.18481585189252123, "learning_rate": 5.13811034652413e-05, "loss": 0.4371, "step": 128 }, { "epoch": 1.412104726283577, "grad_norm": 0.18518123274811235, "learning_rate": 5.088846799731885e-05, "loss": 0.4342, "step": 129 }, { "epoch": 1.4229853791227474, "grad_norm": 0.2010634600287658, "learning_rate": 5.039404221870612e-05, "loss": 0.4296, "step": 130 }, { "epoch": 1.4338660319619176, "grad_norm": 0.23061272903823807, "learning_rate": 4.989790742419658e-05, "loss": 0.4415, "step": 131 }, { "epoch": 1.4447466848010881, "grad_norm": 0.2507778238204534, "learning_rate": 4.940014518958461e-05, "loss": 0.4338, "step": 132 }, { "epoch": 1.4556273376402584, "grad_norm": 0.27522736481625426, "learning_rate": 4.890083735825258e-05, "loss": 0.4397, "step": 133 }, { "epoch": 1.4665079904794287, "grad_norm": 0.2145456230157738, "learning_rate": 4.8400066027713974e-05, "loss": 0.4271, "step": 134 }, { "epoch": 1.4773886433185992, "grad_norm": 0.20238631194211945, "learning_rate": 4.789791353611469e-05, "loss": 0.4229, "step": 135 }, { "epoch": 1.4882692961577695, "grad_norm": 0.17925376141197102, "learning_rate": 4.7394462448694756e-05, "loss": 0.4383, "step": 136 }, { "epoch": 1.4991499489969398, "grad_norm": 0.16639345435419553, "learning_rate": 4.688979554421276e-05, "loss": 0.4307, "step": 137 }, { "epoch": 1.5100306018361103, "grad_norm": 0.16546944794615542, "learning_rate": 4.6383995801335176e-05, "loss": 0.4413, "step": 138 }, { "epoch": 1.5209112546752805, "grad_norm": 0.22823688961718305, "learning_rate": 4.5877146384992725e-05, "loss": 0.4343, "step": 139 }, { "epoch": 1.5317919075144508, "grad_norm": 0.2077914352935504, "learning_rate": 4.5369330632706223e-05, "loss": 0.4304, "step": 140 }, { "epoch": 1.5426725603536213, "grad_norm": 0.178090044195881, "learning_rate": 4.486063204088402e-05, "loss": 0.433, "step": 141 }, { "epoch": 1.5535532131927916, "grad_norm": 0.17467896850970988, "learning_rate": 4.435113425109324e-05, "loss": 0.4298, "step": 142 }, { "epoch": 1.5644338660319619, "grad_norm": 0.17004530557039635, "learning_rate": 4.3840921036307274e-05, "loss": 0.4348, "step": 143 }, { "epoch": 1.5753145188711324, "grad_norm": 0.1956158514979538, "learning_rate": 4.333007628713158e-05, "loss": 0.4384, "step": 144 }, { "epoch": 1.5861951717103027, "grad_norm": 0.16196619070156484, "learning_rate": 4.281868399801016e-05, "loss": 0.4362, "step": 145 }, { "epoch": 1.597075824549473, "grad_norm": 0.15774918769209736, "learning_rate": 4.230682825341498e-05, "loss": 0.4321, "step": 146 }, { "epoch": 1.6079564773886434, "grad_norm": 0.14677757963300414, "learning_rate": 4.17945932140206e-05, "loss": 0.4343, "step": 147 }, { "epoch": 1.6188371302278135, "grad_norm": 0.1586788180866913, "learning_rate": 4.128206310286622e-05, "loss": 0.4319, "step": 148 }, { "epoch": 1.629717783066984, "grad_norm": 0.15657539290671857, "learning_rate": 4.0769322191507485e-05, "loss": 0.4349, "step": 149 }, { "epoch": 1.6405984359061545, "grad_norm": 0.13771725448321517, "learning_rate": 4.025645478616045e-05, "loss": 0.4286, "step": 150 }, { "epoch": 1.6514790887453246, "grad_norm": 0.1792608152635085, "learning_rate": 3.974354521383956e-05, "loss": 0.4326, "step": 151 }, { "epoch": 1.662359741584495, "grad_norm": 0.19697920039522113, "learning_rate": 3.923067780849252e-05, "loss": 0.4325, "step": 152 }, { "epoch": 1.6732403944236656, "grad_norm": 0.15631987727039992, "learning_rate": 3.87179368971338e-05, "loss": 0.4269, "step": 153 }, { "epoch": 1.6841210472628356, "grad_norm": 0.1789741559362441, "learning_rate": 3.820540678597942e-05, "loss": 0.4352, "step": 154 }, { "epoch": 1.6950017001020061, "grad_norm": 0.14589986696711105, "learning_rate": 3.769317174658503e-05, "loss": 0.4331, "step": 155 }, { "epoch": 1.7058823529411766, "grad_norm": 0.16739347857930229, "learning_rate": 3.718131600198984e-05, "loss": 0.4385, "step": 156 }, { "epoch": 1.7167630057803467, "grad_norm": 0.15981306017402697, "learning_rate": 3.666992371286843e-05, "loss": 0.4313, "step": 157 }, { "epoch": 1.7276436586195172, "grad_norm": 0.15235627870495205, "learning_rate": 3.615907896369273e-05, "loss": 0.442, "step": 158 }, { "epoch": 1.7385243114586875, "grad_norm": 0.15094599321638194, "learning_rate": 3.564886574890677e-05, "loss": 0.4376, "step": 159 }, { "epoch": 1.7494049642978577, "grad_norm": 0.15338762976356135, "learning_rate": 3.5139367959115986e-05, "loss": 0.438, "step": 160 }, { "epoch": 1.7602856171370282, "grad_norm": 0.13609508940355913, "learning_rate": 3.4630669367293797e-05, "loss": 0.4301, "step": 161 }, { "epoch": 1.7711662699761985, "grad_norm": 0.14408065680405582, "learning_rate": 3.412285361500729e-05, "loss": 0.4365, "step": 162 }, { "epoch": 1.7820469228153688, "grad_norm": 0.1523591488642166, "learning_rate": 3.3616004198664845e-05, "loss": 0.4261, "step": 163 }, { "epoch": 1.7929275756545393, "grad_norm": 0.16103662054938223, "learning_rate": 3.311020445578725e-05, "loss": 0.4358, "step": 164 }, { "epoch": 1.8038082284937096, "grad_norm": 0.1713601588521581, "learning_rate": 3.260553755130525e-05, "loss": 0.4317, "step": 165 }, { "epoch": 1.8146888813328799, "grad_norm": 0.15595643791635083, "learning_rate": 3.210208646388532e-05, "loss": 0.4201, "step": 166 }, { "epoch": 1.8255695341720504, "grad_norm": 0.14708760810268764, "learning_rate": 3.1599933972286026e-05, "loss": 0.4296, "step": 167 }, { "epoch": 1.8364501870112206, "grad_norm": 0.13811567230657157, "learning_rate": 3.109916264174743e-05, "loss": 0.4252, "step": 168 }, { "epoch": 1.847330839850391, "grad_norm": 0.14763414231896016, "learning_rate": 3.0599854810415393e-05, "loss": 0.4201, "step": 169 }, { "epoch": 1.8582114926895614, "grad_norm": 0.129668633060907, "learning_rate": 3.0102092575803435e-05, "loss": 0.4343, "step": 170 }, { "epoch": 1.8690921455287317, "grad_norm": 0.13079136624725912, "learning_rate": 2.9605957781293893e-05, "loss": 0.4388, "step": 171 }, { "epoch": 1.879972798367902, "grad_norm": 0.14118617253700078, "learning_rate": 2.911153200268116e-05, "loss": 0.4361, "step": 172 }, { "epoch": 1.8908534512070725, "grad_norm": 0.13385452899697334, "learning_rate": 2.8618896534758707e-05, "loss": 0.4303, "step": 173 }, { "epoch": 1.9017341040462428, "grad_norm": 0.14708401774021865, "learning_rate": 2.8128132377952376e-05, "loss": 0.4332, "step": 174 }, { "epoch": 1.912614756885413, "grad_norm": 0.12272534113340453, "learning_rate": 2.7639320225002108e-05, "loss": 0.4228, "step": 175 }, { "epoch": 1.9234954097245835, "grad_norm": 0.1138558266069995, "learning_rate": 2.715254044769418e-05, "loss": 0.4282, "step": 176 }, { "epoch": 1.9343760625637538, "grad_norm": 0.12343156100233148, "learning_rate": 2.666787308364634e-05, "loss": 0.4284, "step": 177 }, { "epoch": 1.945256715402924, "grad_norm": 0.11224315249119837, "learning_rate": 2.6185397823147703e-05, "loss": 0.4265, "step": 178 }, { "epoch": 1.9561373682420946, "grad_norm": 0.10896139436338177, "learning_rate": 2.5705193996055977e-05, "loss": 0.4255, "step": 179 }, { "epoch": 1.967018021081265, "grad_norm": 0.12908408500196814, "learning_rate": 2.5227340558753755e-05, "loss": 0.4267, "step": 180 }, { "epoch": 1.9778986739204352, "grad_norm": 0.11019554007619542, "learning_rate": 2.4751916081166336e-05, "loss": 0.4322, "step": 181 }, { "epoch": 1.9887793267596057, "grad_norm": 0.1233996372590572, "learning_rate": 2.427899873384306e-05, "loss": 0.4237, "step": 182 }, { "epoch": 2.0081604896293777, "grad_norm": 0.15359648389426395, "learning_rate": 2.3808666275104248e-05, "loss": 0.4132, "step": 183 }, { "epoch": 2.019041142468548, "grad_norm": 0.13345360808541243, "learning_rate": 2.334099603825605e-05, "loss": 0.4088, "step": 184 }, { "epoch": 2.0299217953077187, "grad_norm": 0.17411332047982292, "learning_rate": 2.2876064918874993e-05, "loss": 0.4049, "step": 185 }, { "epoch": 2.0408024481468887, "grad_norm": 0.1422404399378093, "learning_rate": 2.241394936216472e-05, "loss": 0.3949, "step": 186 }, { "epoch": 2.051683100986059, "grad_norm": 0.14694793749020102, "learning_rate": 2.1954725350386614e-05, "loss": 0.4004, "step": 187 }, { "epoch": 2.0625637538252297, "grad_norm": 0.1408354112834221, "learning_rate": 2.14984683903666e-05, "loss": 0.4047, "step": 188 }, { "epoch": 2.0734444066643998, "grad_norm": 0.13765680224710142, "learning_rate": 2.1045253501080058e-05, "loss": 0.4029, "step": 189 }, { "epoch": 2.0843250595035703, "grad_norm": 0.13251592143019264, "learning_rate": 2.0595155201317115e-05, "loss": 0.4033, "step": 190 }, { "epoch": 2.095205712342741, "grad_norm": 0.13758159475310247, "learning_rate": 2.0148247497430012e-05, "loss": 0.4035, "step": 191 }, { "epoch": 2.106086365181911, "grad_norm": 0.12620065204508432, "learning_rate": 1.970460387116472e-05, "loss": 0.4003, "step": 192 }, { "epoch": 2.1169670180210813, "grad_norm": 0.13100037610614787, "learning_rate": 1.9264297267579e-05, "loss": 0.4, "step": 193 }, { "epoch": 2.127847670860252, "grad_norm": 0.1342586056051866, "learning_rate": 1.8827400083048503e-05, "loss": 0.4032, "step": 194 }, { "epoch": 2.138728323699422, "grad_norm": 0.1314403186569449, "learning_rate": 1.8393984153363203e-05, "loss": 0.4084, "step": 195 }, { "epoch": 2.1496089765385924, "grad_norm": 0.1275916516562326, "learning_rate": 1.7964120741915905e-05, "loss": 0.4021, "step": 196 }, { "epoch": 2.1604896293777625, "grad_norm": 0.12979778271394132, "learning_rate": 1.753788052798501e-05, "loss": 0.4063, "step": 197 }, { "epoch": 2.171370282216933, "grad_norm": 0.13548411500192925, "learning_rate": 1.7115333595113225e-05, "loss": 0.4093, "step": 198 }, { "epoch": 2.1822509350561035, "grad_norm": 0.11424114716502747, "learning_rate": 1.669654941958416e-05, "loss": 0.3998, "step": 199 }, { "epoch": 2.1931315878952735, "grad_norm": 0.14575557443845474, "learning_rate": 1.628159685899897e-05, "loss": 0.404, "step": 200 }, { "epoch": 2.204012240734444, "grad_norm": 0.12565905473323868, "learning_rate": 1.5870544140954543e-05, "loss": 0.4029, "step": 201 }, { "epoch": 2.2148928935736145, "grad_norm": 0.1322140532060304, "learning_rate": 1.5463458851825345e-05, "loss": 0.4034, "step": 202 }, { "epoch": 2.2257735464127846, "grad_norm": 0.11539342585698967, "learning_rate": 1.5060407925650662e-05, "loss": 0.4047, "step": 203 }, { "epoch": 2.236654199251955, "grad_norm": 0.110043261615137, "learning_rate": 1.466145763312922e-05, "loss": 0.4028, "step": 204 }, { "epoch": 2.2475348520911256, "grad_norm": 0.12462346384772446, "learning_rate": 1.426667357072265e-05, "loss": 0.4057, "step": 205 }, { "epoch": 2.2584155049302956, "grad_norm": 0.11246035528357057, "learning_rate": 1.3876120649870051e-05, "loss": 0.401, "step": 206 }, { "epoch": 2.269296157769466, "grad_norm": 0.11025464415618055, "learning_rate": 1.3489863086315085e-05, "loss": 0.406, "step": 207 }, { "epoch": 2.2801768106086366, "grad_norm": 0.12202834262583902, "learning_rate": 1.3107964389547326e-05, "loss": 0.4105, "step": 208 }, { "epoch": 2.2910574634478067, "grad_norm": 0.10891021403569234, "learning_rate": 1.2730487352360026e-05, "loss": 0.3986, "step": 209 }, { "epoch": 2.301938116286977, "grad_norm": 0.11067574316750495, "learning_rate": 1.2357494040525416e-05, "loss": 0.4026, "step": 210 }, { "epoch": 2.3128187691261477, "grad_norm": 0.12013626538786122, "learning_rate": 1.1989045782589815e-05, "loss": 0.4019, "step": 211 }, { "epoch": 2.3236994219653178, "grad_norm": 0.11734274809139549, "learning_rate": 1.1625203159789686e-05, "loss": 0.404, "step": 212 }, { "epoch": 2.3345800748044883, "grad_norm": 0.13868399399365283, "learning_rate": 1.1266025996090902e-05, "loss": 0.3968, "step": 213 }, { "epoch": 2.3454607276436588, "grad_norm": 0.1021758536629292, "learning_rate": 1.0911573348352107e-05, "loss": 0.3997, "step": 214 }, { "epoch": 2.356341380482829, "grad_norm": 0.11161156337434158, "learning_rate": 1.0561903496614603e-05, "loss": 0.4019, "step": 215 }, { "epoch": 2.3672220333219993, "grad_norm": 0.11628871391170659, "learning_rate": 1.0217073934519726e-05, "loss": 0.3961, "step": 216 }, { "epoch": 2.37810268616117, "grad_norm": 0.10053214623418112, "learning_rate": 9.877141359855567e-06, "loss": 0.4107, "step": 217 }, { "epoch": 2.38898333900034, "grad_norm": 0.10333880551211917, "learning_rate": 9.542161665234623e-06, "loss": 0.4041, "step": 218 }, { "epoch": 2.3998639918395104, "grad_norm": 0.10378698888338332, "learning_rate": 9.212189928903758e-06, "loss": 0.4057, "step": 219 }, { "epoch": 2.410744644678681, "grad_norm": 0.10095387706934292, "learning_rate": 8.887280405688106e-06, "loss": 0.4035, "step": 220 }, { "epoch": 2.421625297517851, "grad_norm": 0.09702668485060174, "learning_rate": 8.567486518070306e-06, "loss": 0.3979, "step": 221 }, { "epoch": 2.4325059503570214, "grad_norm": 0.0968990526407044, "learning_rate": 8.252860847406712e-06, "loss": 0.4007, "step": 222 }, { "epoch": 2.443386603196192, "grad_norm": 0.09758367833082218, "learning_rate": 7.943455125281741e-06, "loss": 0.4038, "step": 223 }, { "epoch": 2.454267256035362, "grad_norm": 0.09335426131514203, "learning_rate": 7.639320225002106e-06, "loss": 0.3987, "step": 224 }, { "epoch": 2.4651479088745325, "grad_norm": 0.09620155060950364, "learning_rate": 7.340506153232052e-06, "loss": 0.4051, "step": 225 }, { "epoch": 2.476028561713703, "grad_norm": 0.09238550728932193, "learning_rate": 7.047062041771133e-06, "loss": 0.4038, "step": 226 }, { "epoch": 2.486909214552873, "grad_norm": 0.08653478796623289, "learning_rate": 6.759036139475843e-06, "loss": 0.4035, "step": 227 }, { "epoch": 2.4977898673920436, "grad_norm": 0.09462132238499406, "learning_rate": 6.476475804326377e-06, "loss": 0.3945, "step": 228 }, { "epoch": 2.508670520231214, "grad_norm": 0.09364175398439001, "learning_rate": 6.199427495639963e-06, "loss": 0.3953, "step": 229 }, { "epoch": 2.519551173070384, "grad_norm": 0.09533512853144702, "learning_rate": 5.927936766431836e-06, "loss": 0.3951, "step": 230 }, { "epoch": 2.5304318259095546, "grad_norm": 0.08988259771164411, "learning_rate": 5.662048255925357e-06, "loss": 0.4007, "step": 231 }, { "epoch": 2.5413124787487247, "grad_norm": 0.08544449128239766, "learning_rate": 5.40180568221226e-06, "loss": 0.3956, "step": 232 }, { "epoch": 2.552193131587895, "grad_norm": 0.08214234004836413, "learning_rate": 5.147251835064424e-06, "loss": 0.401, "step": 233 }, { "epoch": 2.5630737844270657, "grad_norm": 0.08978715763135, "learning_rate": 4.898428568898288e-06, "loss": 0.4018, "step": 234 }, { "epoch": 2.573954437266236, "grad_norm": 0.09111850134473688, "learning_rate": 4.65537679589299e-06, "loss": 0.4033, "step": 235 }, { "epoch": 2.5848350901054062, "grad_norm": 0.08281874038672186, "learning_rate": 4.418136479263533e-06, "loss": 0.3957, "step": 236 }, { "epoch": 2.5957157429445767, "grad_norm": 0.0875486805676823, "learning_rate": 4.186746626689879e-06, "loss": 0.3978, "step": 237 }, { "epoch": 2.606596395783747, "grad_norm": 0.08327809325190605, "learning_rate": 3.961245283903239e-06, "loss": 0.4004, "step": 238 }, { "epoch": 2.6174770486229173, "grad_norm": 0.08331308067123125, "learning_rate": 3.7416695284304737e-06, "loss": 0.4062, "step": 239 }, { "epoch": 2.628357701462088, "grad_norm": 0.08388783642844128, "learning_rate": 3.5280554634977217e-06, "loss": 0.3974, "step": 240 }, { "epoch": 2.6392383543012583, "grad_norm": 0.08349690695940903, "learning_rate": 3.320438212094197e-06, "loss": 0.4021, "step": 241 }, { "epoch": 2.6501190071404284, "grad_norm": 0.08518098898214005, "learning_rate": 3.1188519111971804e-06, "loss": 0.4046, "step": 242 }, { "epoch": 2.660999659979599, "grad_norm": 0.08404039117187238, "learning_rate": 2.9233297061591346e-06, "loss": 0.3986, "step": 243 }, { "epoch": 2.671880312818769, "grad_norm": 0.08468392011098298, "learning_rate": 2.733903745257838e-06, "loss": 0.4143, "step": 244 }, { "epoch": 2.6827609656579394, "grad_norm": 0.08035934775357019, "learning_rate": 2.550605174410512e-06, "loss": 0.3982, "step": 245 }, { "epoch": 2.69364161849711, "grad_norm": 0.08100249829782087, "learning_rate": 2.373464132052701e-06, "loss": 0.3975, "step": 246 }, { "epoch": 2.7045222713362804, "grad_norm": 0.08405527434090163, "learning_rate": 2.202509744182835e-06, "loss": 0.3958, "step": 247 }, { "epoch": 2.7154029241754505, "grad_norm": 0.08458090883428918, "learning_rate": 2.0377701195732545e-06, "loss": 0.4094, "step": 248 }, { "epoch": 2.726283577014621, "grad_norm": 0.08120194628003821, "learning_rate": 1.879272345148513e-06, "loss": 0.4071, "step": 249 }, { "epoch": 2.737164229853791, "grad_norm": 0.08381423080148898, "learning_rate": 1.727042481531651e-06, "loss": 0.3997, "step": 250 }, { "epoch": 2.7480448826929615, "grad_norm": 0.08003721871020845, "learning_rate": 1.5811055587592283e-06, "loss": 0.4032, "step": 251 }, { "epoch": 2.758925535532132, "grad_norm": 0.07906608009605874, "learning_rate": 1.4414855721658705e-06, "loss": 0.4011, "step": 252 }, { "epoch": 2.7698061883713025, "grad_norm": 0.07989568893361568, "learning_rate": 1.3082054784388221e-06, "loss": 0.3938, "step": 253 }, { "epoch": 2.7806868412104726, "grad_norm": 0.07531713942027396, "learning_rate": 1.1812871918434143e-06, "loss": 0.4036, "step": 254 }, { "epoch": 2.791567494049643, "grad_norm": 0.07349157785840128, "learning_rate": 1.0607515806198142e-06, "loss": 0.3975, "step": 255 }, { "epoch": 2.802448146888813, "grad_norm": 0.07248419100011405, "learning_rate": 9.466184635518361e-07, "loss": 0.397, "step": 256 }, { "epoch": 2.8133287997279837, "grad_norm": 0.07667024498059592, "learning_rate": 8.389066067082852e-07, "loss": 0.4011, "step": 257 }, { "epoch": 2.824209452567154, "grad_norm": 0.07642598936379766, "learning_rate": 7.376337203573824e-07, "loss": 0.4, "step": 258 }, { "epoch": 2.8350901054063242, "grad_norm": 0.07461923023970361, "learning_rate": 6.428164560548134e-07, "loss": 0.4021, "step": 259 }, { "epoch": 2.8459707582454947, "grad_norm": 0.08011069712526726, "learning_rate": 5.544704039058025e-07, "loss": 0.4001, "step": 260 }, { "epoch": 2.8568514110846652, "grad_norm": 0.0778371155244566, "learning_rate": 4.7261009000177274e-07, "loss": 0.4025, "step": 261 }, { "epoch": 2.8677320639238353, "grad_norm": 0.0756170688560216, "learning_rate": 3.972489740319274e-07, "loss": 0.4091, "step": 262 }, { "epoch": 2.878612716763006, "grad_norm": 0.0747265642036147, "learning_rate": 3.283994470701579e-07, "loss": 0.4055, "step": 263 }, { "epoch": 2.8894933696021763, "grad_norm": 0.07119073646902688, "learning_rate": 2.66072829537678e-07, "loss": 0.408, "step": 264 }, { "epoch": 2.9003740224413463, "grad_norm": 0.07649012974948233, "learning_rate": 2.102793693417038e-07, "loss": 0.402, "step": 265 }, { "epoch": 2.911254675280517, "grad_norm": 0.07588643659629785, "learning_rate": 1.6102824019043728e-07, "loss": 0.3985, "step": 266 }, { "epoch": 2.9221353281196873, "grad_norm": 0.07431991189730484, "learning_rate": 1.1832754008472614e-07, "loss": 0.4025, "step": 267 }, { "epoch": 2.9330159809588574, "grad_norm": 0.0777086423840054, "learning_rate": 8.21842899865466e-08, "loss": 0.3962, "step": 268 }, { "epoch": 2.943896633798028, "grad_norm": 0.07387694283814854, "learning_rate": 5.260443266462467e-08, "loss": 0.3982, "step": 269 }, { "epoch": 2.9547772866371984, "grad_norm": 0.07271598906861958, "learning_rate": 2.9592831717293326e-08, "loss": 0.3988, "step": 270 }, { "epoch": 2.9656579394763685, "grad_norm": 0.07496499490626318, "learning_rate": 1.3153270772807702e-08, "loss": 0.4009, "step": 271 }, { "epoch": 2.976538592315539, "grad_norm": 0.07330139638176505, "learning_rate": 3.2884528672294523e-09, "loss": 0.4001, "step": 272 }, { "epoch": 2.987419245154709, "grad_norm": 0.07348531822682038, "learning_rate": 0.0, "loss": 0.4048, "step": 273 }, { "epoch": 2.987419245154709, "step": 273, "total_flos": 7.259120111838036e+18, "train_loss": 0.5428441533675561, "train_runtime": 82839.4057, "train_samples_per_second": 1.704, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 273, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.259120111838036e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }