{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0011983223487118, "eval_steps": 500, "global_step": 20040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009986019572598363, "grad_norm": 14.6875, "learning_rate": 5.625e-06, "loss": 1.2935, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10, "tokens_per_second_per_gpu": 2433.25 }, { "epoch": 0.0019972039145196726, "grad_norm": 4.875, "learning_rate": 1.1875e-05, "loss": 0.854, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20, "tokens_per_second_per_gpu": 2456.09 }, { "epoch": 0.0029958058717795086, "grad_norm": 3.765625, "learning_rate": 1.8125e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 30, "tokens_per_second_per_gpu": 2727.46 }, { "epoch": 0.003994407829039345, "grad_norm": 5.1875, "learning_rate": 2.4375e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 40, "tokens_per_second_per_gpu": 2542.63 }, { "epoch": 0.004993009786299181, "grad_norm": 4.125, "learning_rate": 2.4999996883431864e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 50, "tokens_per_second_per_gpu": 2345.48 }, { "epoch": 0.005991611743559017, "grad_norm": 3.375, "learning_rate": 2.499998611011191e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 60, "tokens_per_second_per_gpu": 2522.77 }, { "epoch": 0.006990213700818854, "grad_norm": 3.6875, "learning_rate": 2.4999967641570623e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 70, "tokens_per_second_per_gpu": 2549.09 }, { "epoch": 0.00798881565807869, "grad_norm": 5.0625, "learning_rate": 2.4999941477819366e-05, "loss": 0.655, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 80, "tokens_per_second_per_gpu": 2546.51 }, { "epoch": 0.008987417615338526, "grad_norm": 3.984375, "learning_rate": 2.499990761887425e-05, "loss": 0.5784, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 90, "tokens_per_second_per_gpu": 2584.81 }, { "epoch": 0.009986019572598362, "grad_norm": 3.515625, "learning_rate": 2.4999866064756117e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 100, "tokens_per_second_per_gpu": 2478.64 }, { "epoch": 0.010984621529858199, "grad_norm": 4.03125, "learning_rate": 2.499981681549055e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 110, "tokens_per_second_per_gpu": 2465.76 }, { "epoch": 0.011983223487118035, "grad_norm": 4.34375, "learning_rate": 2.4999759871107865e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 120, "tokens_per_second_per_gpu": 2387.83 }, { "epoch": 0.01298182544437787, "grad_norm": 4.75, "learning_rate": 2.4999695231643118e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 130, "tokens_per_second_per_gpu": 2498.78 }, { "epoch": 0.013980427401637708, "grad_norm": 3.6875, "learning_rate": 2.499962289713611e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 140, "tokens_per_second_per_gpu": 2553.78 }, { "epoch": 0.014979029358897543, "grad_norm": 4.15625, "learning_rate": 2.499954286763136e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 150, "tokens_per_second_per_gpu": 2560.78 }, { "epoch": 0.01597763131615738, "grad_norm": 3.421875, "learning_rate": 2.4999455143178143e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 160, "tokens_per_second_per_gpu": 2449.63 }, { "epoch": 0.016976233273417216, "grad_norm": 4.1875, "learning_rate": 2.499935972383046e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 170, "tokens_per_second_per_gpu": 2483.81 }, { "epoch": 0.017974835230677052, "grad_norm": 4.28125, "learning_rate": 2.499925660964706e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 180, "tokens_per_second_per_gpu": 2676.57 }, { "epoch": 0.018973437187936888, "grad_norm": 3.34375, "learning_rate": 2.4999145800691415e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 190, "tokens_per_second_per_gpu": 2547.88 }, { "epoch": 0.019972039145196723, "grad_norm": 4.25, "learning_rate": 2.4999027297031743e-05, "loss": 0.6962, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 200, "tokens_per_second_per_gpu": 2651.86 }, { "epoch": 0.020970641102456562, "grad_norm": 3.5, "learning_rate": 2.4998901098740997e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 210, "tokens_per_second_per_gpu": 2523.71 }, { "epoch": 0.021969243059716398, "grad_norm": 3.734375, "learning_rate": 2.4998767205896865e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 220, "tokens_per_second_per_gpu": 2453.45 }, { "epoch": 0.022967845016976234, "grad_norm": 3.53125, "learning_rate": 2.499862561858178e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 230, "tokens_per_second_per_gpu": 2649.66 }, { "epoch": 0.02396644697423607, "grad_norm": 5.21875, "learning_rate": 2.49984763368829e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 240, "tokens_per_second_per_gpu": 2473.23 }, { "epoch": 0.024965048931495905, "grad_norm": 3.359375, "learning_rate": 2.499831936089213e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 250, "tokens_per_second_per_gpu": 2570.46 }, { "epoch": 0.02596365088875574, "grad_norm": 4.65625, "learning_rate": 2.49981546907061e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 260, "tokens_per_second_per_gpu": 2503.01 }, { "epoch": 0.02696225284601558, "grad_norm": 3.5, "learning_rate": 2.499798232642619e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 270, "tokens_per_second_per_gpu": 2626.8 }, { "epoch": 0.027960854803275415, "grad_norm": 4.8125, "learning_rate": 2.499780226815851e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 280, "tokens_per_second_per_gpu": 2601.43 }, { "epoch": 0.02895945676053525, "grad_norm": 3.484375, "learning_rate": 2.4997614516013902e-05, "loss": 0.5671, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 290, "tokens_per_second_per_gpu": 2545.89 }, { "epoch": 0.029958058717795086, "grad_norm": 3.46875, "learning_rate": 2.499741907010796e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 300, "tokens_per_second_per_gpu": 2372.26 }, { "epoch": 0.030956660675054922, "grad_norm": 4.0, "learning_rate": 2.4997215930560997e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 310, "tokens_per_second_per_gpu": 2587.61 }, { "epoch": 0.03195526263231476, "grad_norm": 3.625, "learning_rate": 2.4997005097498068e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 320, "tokens_per_second_per_gpu": 2526.99 }, { "epoch": 0.0329538645895746, "grad_norm": 3.984375, "learning_rate": 2.499678657104897e-05, "loss": 0.613, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 330, "tokens_per_second_per_gpu": 2517.74 }, { "epoch": 0.03395246654683443, "grad_norm": 3.375, "learning_rate": 2.499656035134823e-05, "loss": 0.6758, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 340, "tokens_per_second_per_gpu": 2381.37 }, { "epoch": 0.03495106850409427, "grad_norm": 3.484375, "learning_rate": 2.499632643853511e-05, "loss": 0.5786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 350, "tokens_per_second_per_gpu": 2478.29 }, { "epoch": 0.035949670461354104, "grad_norm": 3.671875, "learning_rate": 2.4996084832753617e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 360, "tokens_per_second_per_gpu": 2411.29 }, { "epoch": 0.03694827241861394, "grad_norm": 4.0625, "learning_rate": 2.4995835534152486e-05, "loss": 0.5763, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 370, "tokens_per_second_per_gpu": 2577.58 }, { "epoch": 0.037946874375873775, "grad_norm": 3.75, "learning_rate": 2.499557854288519e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 380, "tokens_per_second_per_gpu": 2664.57 }, { "epoch": 0.03894547633313361, "grad_norm": 3.09375, "learning_rate": 2.4995313859109933e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 390, "tokens_per_second_per_gpu": 2496.57 }, { "epoch": 0.039944078290393446, "grad_norm": 3.796875, "learning_rate": 2.4995041482989668e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 400, "tokens_per_second_per_gpu": 2388.44 }, { "epoch": 0.04094268024765328, "grad_norm": 3.78125, "learning_rate": 2.4994761414692064e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 410, "tokens_per_second_per_gpu": 2590.92 }, { "epoch": 0.041941282204913125, "grad_norm": 3.234375, "learning_rate": 2.4994473654389542e-05, "loss": 0.5739, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 420, "tokens_per_second_per_gpu": 2513.79 }, { "epoch": 0.04293988416217296, "grad_norm": 3.796875, "learning_rate": 2.4994178202259255e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 430, "tokens_per_second_per_gpu": 2555.06 }, { "epoch": 0.043938486119432796, "grad_norm": 2.625, "learning_rate": 2.4993875058483085e-05, "loss": 0.5553, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 440, "tokens_per_second_per_gpu": 2507.61 }, { "epoch": 0.04493708807669263, "grad_norm": 3.28125, "learning_rate": 2.4993564223247655e-05, "loss": 0.5436, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 450, "tokens_per_second_per_gpu": 2448.31 }, { "epoch": 0.04593569003395247, "grad_norm": 4.875, "learning_rate": 2.4993245696744318e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 460, "tokens_per_second_per_gpu": 2441.27 }, { "epoch": 0.0469342919912123, "grad_norm": 3.109375, "learning_rate": 2.499291947916917e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 470, "tokens_per_second_per_gpu": 2348.09 }, { "epoch": 0.04793289394847214, "grad_norm": 4.03125, "learning_rate": 2.4992585570723025e-05, "loss": 0.5154, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 480, "tokens_per_second_per_gpu": 2521.61 }, { "epoch": 0.048931495905731974, "grad_norm": 3.015625, "learning_rate": 2.499224397161146e-05, "loss": 0.577, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 490, "tokens_per_second_per_gpu": 2723.84 }, { "epoch": 0.04993009786299181, "grad_norm": 2.828125, "learning_rate": 2.4991894682044757e-05, "loss": 0.608, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 500, "tokens_per_second_per_gpu": 2604.08 }, { "epoch": 0.050928699820251645, "grad_norm": 3.171875, "learning_rate": 2.4991537702237948e-05, "loss": 0.5497, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 510, "tokens_per_second_per_gpu": 2482.06 }, { "epoch": 0.05192730177751148, "grad_norm": 3.640625, "learning_rate": 2.4991173032410796e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 520, "tokens_per_second_per_gpu": 2453.18 }, { "epoch": 0.052925903734771323, "grad_norm": 3.34375, "learning_rate": 2.4990800672787805e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 530, "tokens_per_second_per_gpu": 2533.46 }, { "epoch": 0.05392450569203116, "grad_norm": 2.734375, "learning_rate": 2.4990420623598197e-05, "loss": 0.5488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 540, "tokens_per_second_per_gpu": 2447.38 }, { "epoch": 0.054923107649290995, "grad_norm": 3.109375, "learning_rate": 2.4990032885075945e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 550, "tokens_per_second_per_gpu": 2593.13 }, { "epoch": 0.05592170960655083, "grad_norm": 3.34375, "learning_rate": 2.498963745745974e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 560, "tokens_per_second_per_gpu": 2345.76 }, { "epoch": 0.056920311563810666, "grad_norm": 4.875, "learning_rate": 2.4989234340993026e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 570, "tokens_per_second_per_gpu": 2451.35 }, { "epoch": 0.0579189135210705, "grad_norm": 3.65625, "learning_rate": 2.4988823535923956e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 580, "tokens_per_second_per_gpu": 2638.89 }, { "epoch": 0.05891751547833034, "grad_norm": 3.953125, "learning_rate": 2.4988405042505434e-05, "loss": 0.591, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 590, "tokens_per_second_per_gpu": 2659.59 }, { "epoch": 0.05991611743559017, "grad_norm": 4.21875, "learning_rate": 2.4987978860995098e-05, "loss": 0.641, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 600, "tokens_per_second_per_gpu": 2304.14 }, { "epoch": 0.06091471939285001, "grad_norm": 3.9375, "learning_rate": 2.498754499165531e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 610, "tokens_per_second_per_gpu": 2461.25 }, { "epoch": 0.061913321350109844, "grad_norm": 3.9375, "learning_rate": 2.4987103434753163e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 620, "tokens_per_second_per_gpu": 2578.47 }, { "epoch": 0.06291192330736968, "grad_norm": 4.03125, "learning_rate": 2.498665419056049e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 630, "tokens_per_second_per_gpu": 2607.5 }, { "epoch": 0.06391052526462952, "grad_norm": 2.859375, "learning_rate": 2.4986197259353855e-05, "loss": 0.532, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 640, "tokens_per_second_per_gpu": 2560.27 }, { "epoch": 0.06490912722188935, "grad_norm": 2.921875, "learning_rate": 2.4985732641414555e-05, "loss": 0.5259, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 650, "tokens_per_second_per_gpu": 2467.38 }, { "epoch": 0.0659077291791492, "grad_norm": 3.578125, "learning_rate": 2.4985260337028613e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 660, "tokens_per_second_per_gpu": 2394.48 }, { "epoch": 0.06690633113640902, "grad_norm": 3.765625, "learning_rate": 2.4984780346486793e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 670, "tokens_per_second_per_gpu": 2536.05 }, { "epoch": 0.06790493309366886, "grad_norm": 3.734375, "learning_rate": 2.4984292670084582e-05, "loss": 0.4979, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 680, "tokens_per_second_per_gpu": 2580.76 }, { "epoch": 0.0689035350509287, "grad_norm": 4.1875, "learning_rate": 2.4983797308122205e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 690, "tokens_per_second_per_gpu": 2450.58 }, { "epoch": 0.06990213700818854, "grad_norm": 2.734375, "learning_rate": 2.4983294260904615e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 700, "tokens_per_second_per_gpu": 2511.56 }, { "epoch": 0.07090073896544838, "grad_norm": 3.203125, "learning_rate": 2.49827835287415e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 710, "tokens_per_second_per_gpu": 2516.19 }, { "epoch": 0.07189934092270821, "grad_norm": 3.734375, "learning_rate": 2.4982265111947273e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 720, "tokens_per_second_per_gpu": 2545.0 }, { "epoch": 0.07289794287996805, "grad_norm": 3.640625, "learning_rate": 2.4981739010841078e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 730, "tokens_per_second_per_gpu": 2567.94 }, { "epoch": 0.07389654483722788, "grad_norm": 3.328125, "learning_rate": 2.49812052257468e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 740, "tokens_per_second_per_gpu": 2595.01 }, { "epoch": 0.07489514679448772, "grad_norm": 3.96875, "learning_rate": 2.4980663756993042e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 750, "tokens_per_second_per_gpu": 2628.94 }, { "epoch": 0.07589374875174755, "grad_norm": 3.53125, "learning_rate": 2.498011460491314e-05, "loss": 0.5795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 760, "tokens_per_second_per_gpu": 2616.04 }, { "epoch": 0.07689235070900739, "grad_norm": 4.03125, "learning_rate": 2.497955776984517e-05, "loss": 0.5619, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 770, "tokens_per_second_per_gpu": 2441.7 }, { "epoch": 0.07789095266626722, "grad_norm": 4.28125, "learning_rate": 2.497899325213192e-05, "loss": 0.5717, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 780, "tokens_per_second_per_gpu": 2460.02 }, { "epoch": 0.07888955462352706, "grad_norm": 3.53125, "learning_rate": 2.4978421052120928e-05, "loss": 0.5271, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 790, "tokens_per_second_per_gpu": 2416.47 }, { "epoch": 0.07988815658078689, "grad_norm": 3.03125, "learning_rate": 2.497784117016444e-05, "loss": 0.5435, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 800, "tokens_per_second_per_gpu": 2378.95 }, { "epoch": 0.08088675853804674, "grad_norm": 3.03125, "learning_rate": 2.497725360661945e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 810, "tokens_per_second_per_gpu": 2548.34 }, { "epoch": 0.08188536049530656, "grad_norm": 3.671875, "learning_rate": 2.497665836184767e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 820, "tokens_per_second_per_gpu": 2410.81 }, { "epoch": 0.0828839624525664, "grad_norm": 3.3125, "learning_rate": 2.4976055436215544e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 830, "tokens_per_second_per_gpu": 2564.2 }, { "epoch": 0.08388256440982625, "grad_norm": 3.84375, "learning_rate": 2.4975444830094245e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 840, "tokens_per_second_per_gpu": 2407.54 }, { "epoch": 0.08488116636708608, "grad_norm": 3.28125, "learning_rate": 2.4974826543859674e-05, "loss": 0.5788, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 850, "tokens_per_second_per_gpu": 2543.54 }, { "epoch": 0.08587976832434592, "grad_norm": 4.3125, "learning_rate": 2.4974200577892452e-05, "loss": 0.7068, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 860, "tokens_per_second_per_gpu": 2471.1 }, { "epoch": 0.08687837028160575, "grad_norm": 3.40625, "learning_rate": 2.4973566932577947e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 870, "tokens_per_second_per_gpu": 2520.86 }, { "epoch": 0.08787697223886559, "grad_norm": 3.734375, "learning_rate": 2.4972925608306233e-05, "loss": 0.646, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 880, "tokens_per_second_per_gpu": 2564.43 }, { "epoch": 0.08887557419612542, "grad_norm": 3.203125, "learning_rate": 2.4972276605472126e-05, "loss": 0.5778, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 890, "tokens_per_second_per_gpu": 2537.1 }, { "epoch": 0.08987417615338526, "grad_norm": 3.078125, "learning_rate": 2.4971619924475162e-05, "loss": 0.5407, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 900, "tokens_per_second_per_gpu": 2533.92 }, { "epoch": 0.09087277811064509, "grad_norm": 3.828125, "learning_rate": 2.4970955565719606e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 910, "tokens_per_second_per_gpu": 2553.55 }, { "epoch": 0.09187138006790493, "grad_norm": 3.375, "learning_rate": 2.4970283529614452e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 920, "tokens_per_second_per_gpu": 2478.03 }, { "epoch": 0.09286998202516476, "grad_norm": 2.546875, "learning_rate": 2.4969603816573416e-05, "loss": 0.5222, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 930, "tokens_per_second_per_gpu": 2625.45 }, { "epoch": 0.0938685839824246, "grad_norm": 3.625, "learning_rate": 2.496891642701494e-05, "loss": 0.5492, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 940, "tokens_per_second_per_gpu": 2548.76 }, { "epoch": 0.09486718593968445, "grad_norm": 4.40625, "learning_rate": 2.49682213613622e-05, "loss": 0.5474, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 950, "tokens_per_second_per_gpu": 2533.71 }, { "epoch": 0.09586578789694428, "grad_norm": 3.203125, "learning_rate": 2.496751862004308e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 960, "tokens_per_second_per_gpu": 2528.55 }, { "epoch": 0.09686438985420412, "grad_norm": 3.265625, "learning_rate": 2.496680820349021e-05, "loss": 0.5566, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 970, "tokens_per_second_per_gpu": 2739.19 }, { "epoch": 0.09786299181146395, "grad_norm": 4.25, "learning_rate": 2.4966090112140932e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 980, "tokens_per_second_per_gpu": 2651.78 }, { "epoch": 0.09886159376872379, "grad_norm": 2.828125, "learning_rate": 2.4965364346437317e-05, "loss": 0.5078, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 990, "tokens_per_second_per_gpu": 2628.16 }, { "epoch": 0.09986019572598362, "grad_norm": 3.6875, "learning_rate": 2.496463090682616e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1000, "tokens_per_second_per_gpu": 2624.44 }, { "epoch": 0.10085879768324346, "grad_norm": 4.03125, "learning_rate": 2.496388979375898e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1010, "tokens_per_second_per_gpu": 2494.73 }, { "epoch": 0.10185739964050329, "grad_norm": 3.734375, "learning_rate": 2.4963141007692022e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1020, "tokens_per_second_per_gpu": 2569.75 }, { "epoch": 0.10285600159776313, "grad_norm": 2.828125, "learning_rate": 2.496238454908624e-05, "loss": 0.534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1030, "tokens_per_second_per_gpu": 2503.62 }, { "epoch": 0.10385460355502296, "grad_norm": 4.09375, "learning_rate": 2.4961620418407343e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1040, "tokens_per_second_per_gpu": 2576.86 }, { "epoch": 0.1048532055122828, "grad_norm": 3.984375, "learning_rate": 2.4960848616125733e-05, "loss": 0.622, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1050, "tokens_per_second_per_gpu": 2616.67 }, { "epoch": 0.10585180746954265, "grad_norm": 3.0, "learning_rate": 2.4960069142716544e-05, "loss": 0.5688, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1060, "tokens_per_second_per_gpu": 2562.4 }, { "epoch": 0.10685040942680248, "grad_norm": 2.984375, "learning_rate": 2.495928199865964e-05, "loss": 0.5633, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1070, "tokens_per_second_per_gpu": 2546.41 }, { "epoch": 0.10784901138406232, "grad_norm": 3.9375, "learning_rate": 2.4958487184439597e-05, "loss": 0.4996, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1080, "tokens_per_second_per_gpu": 2592.55 }, { "epoch": 0.10884761334132215, "grad_norm": 4.28125, "learning_rate": 2.495768470054572e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1090, "tokens_per_second_per_gpu": 2584.93 }, { "epoch": 0.10984621529858199, "grad_norm": 2.921875, "learning_rate": 2.495687454747203e-05, "loss": 0.5217, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1100, "tokens_per_second_per_gpu": 2649.48 }, { "epoch": 0.11084481725584182, "grad_norm": 2.8125, "learning_rate": 2.4956056725717276e-05, "loss": 0.5152, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1110, "tokens_per_second_per_gpu": 2571.54 }, { "epoch": 0.11184341921310166, "grad_norm": 3.453125, "learning_rate": 2.4955231235784922e-05, "loss": 0.5394, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1120, "tokens_per_second_per_gpu": 2618.3 }, { "epoch": 0.11284202117036149, "grad_norm": 2.5, "learning_rate": 2.4954398078183154e-05, "loss": 0.5768, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1130, "tokens_per_second_per_gpu": 2456.04 }, { "epoch": 0.11384062312762133, "grad_norm": 3.71875, "learning_rate": 2.4953557253424882e-05, "loss": 0.5728, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1140, "tokens_per_second_per_gpu": 2504.75 }, { "epoch": 0.11483922508488116, "grad_norm": 3.546875, "learning_rate": 2.4952708762027727e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1150, "tokens_per_second_per_gpu": 2646.76 }, { "epoch": 0.115837827042141, "grad_norm": 3.453125, "learning_rate": 2.495185260451404e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1160, "tokens_per_second_per_gpu": 2543.13 }, { "epoch": 0.11683642899940083, "grad_norm": 2.828125, "learning_rate": 2.4950988781410892e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1170, "tokens_per_second_per_gpu": 2634.88 }, { "epoch": 0.11783503095666067, "grad_norm": 2.796875, "learning_rate": 2.495011729325006e-05, "loss": 0.5553, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1180, "tokens_per_second_per_gpu": 2636.98 }, { "epoch": 0.11883363291392052, "grad_norm": 3.125, "learning_rate": 2.4949238140568054e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1190, "tokens_per_second_per_gpu": 2433.71 }, { "epoch": 0.11983223487118035, "grad_norm": 3.03125, "learning_rate": 2.4948351323906093e-05, "loss": 0.5565, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1200, "tokens_per_second_per_gpu": 2425.42 }, { "epoch": 0.12083083682844019, "grad_norm": 3.875, "learning_rate": 2.494745684381012e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1210, "tokens_per_second_per_gpu": 2579.97 }, { "epoch": 0.12182943878570002, "grad_norm": 3.8125, "learning_rate": 2.494655470083079e-05, "loss": 0.5549, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1220, "tokens_per_second_per_gpu": 2587.4 }, { "epoch": 0.12282804074295986, "grad_norm": 4.53125, "learning_rate": 2.4945644895523487e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1230, "tokens_per_second_per_gpu": 2518.03 }, { "epoch": 0.12382664270021969, "grad_norm": 3.390625, "learning_rate": 2.4944727428448295e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1240, "tokens_per_second_per_gpu": 2612.22 }, { "epoch": 0.12482524465747953, "grad_norm": 2.65625, "learning_rate": 2.4943802300170027e-05, "loss": 0.587, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1250, "tokens_per_second_per_gpu": 2553.1 }, { "epoch": 0.12582384661473936, "grad_norm": 3.671875, "learning_rate": 2.494286951125821e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1260, "tokens_per_second_per_gpu": 2586.72 }, { "epoch": 0.1268224485719992, "grad_norm": 2.5, "learning_rate": 2.494192906228708e-05, "loss": 0.5679, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1270, "tokens_per_second_per_gpu": 2514.0 }, { "epoch": 0.12782105052925904, "grad_norm": 4.53125, "learning_rate": 2.4940980953835602e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1280, "tokens_per_second_per_gpu": 2554.42 }, { "epoch": 0.1288196524865189, "grad_norm": 3.5625, "learning_rate": 2.494002518648745e-05, "loss": 0.5308, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1290, "tokens_per_second_per_gpu": 2339.2 }, { "epoch": 0.1298182544437787, "grad_norm": 3.375, "learning_rate": 2.4939061760831007e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1300, "tokens_per_second_per_gpu": 2681.19 }, { "epoch": 0.13081685640103854, "grad_norm": 2.65625, "learning_rate": 2.4938090677459374e-05, "loss": 0.5446, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1310, "tokens_per_second_per_gpu": 2655.93 }, { "epoch": 0.1318154583582984, "grad_norm": 3.109375, "learning_rate": 2.493711193697037e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1320, "tokens_per_second_per_gpu": 2547.55 }, { "epoch": 0.13281406031555823, "grad_norm": 3.359375, "learning_rate": 2.493612553996653e-05, "loss": 0.5614, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1330, "tokens_per_second_per_gpu": 2681.78 }, { "epoch": 0.13381266227281804, "grad_norm": 3.234375, "learning_rate": 2.4935131487055094e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1340, "tokens_per_second_per_gpu": 2502.16 }, { "epoch": 0.1348112642300779, "grad_norm": 3.578125, "learning_rate": 2.493412977884802e-05, "loss": 0.5644, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1350, "tokens_per_second_per_gpu": 2587.13 }, { "epoch": 0.13580986618733773, "grad_norm": 3.75, "learning_rate": 2.4933120415961975e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1360, "tokens_per_second_per_gpu": 2491.03 }, { "epoch": 0.13680846814459757, "grad_norm": 3.75, "learning_rate": 2.4932103399018346e-05, "loss": 0.646, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1370, "tokens_per_second_per_gpu": 2662.49 }, { "epoch": 0.1378070701018574, "grad_norm": 3.5, "learning_rate": 2.4931078728643226e-05, "loss": 0.534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1380, "tokens_per_second_per_gpu": 2355.97 }, { "epoch": 0.13880567205911723, "grad_norm": 3.03125, "learning_rate": 2.493004640546742e-05, "loss": 0.5726, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1390, "tokens_per_second_per_gpu": 2610.61 }, { "epoch": 0.13980427401637707, "grad_norm": 2.828125, "learning_rate": 2.4929006430126445e-05, "loss": 0.5112, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1400, "tokens_per_second_per_gpu": 2532.1 }, { "epoch": 0.14080287597363692, "grad_norm": 3.9375, "learning_rate": 2.492795880326053e-05, "loss": 0.5487, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1410, "tokens_per_second_per_gpu": 2326.36 }, { "epoch": 0.14180147793089676, "grad_norm": 3.5625, "learning_rate": 2.492690352551461e-05, "loss": 0.5554, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1420, "tokens_per_second_per_gpu": 2442.66 }, { "epoch": 0.14280007988815657, "grad_norm": 2.984375, "learning_rate": 2.4925840597538343e-05, "loss": 0.5526, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1430, "tokens_per_second_per_gpu": 2556.92 }, { "epoch": 0.14379868184541642, "grad_norm": 3.96875, "learning_rate": 2.4924770019986075e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1440, "tokens_per_second_per_gpu": 2374.16 }, { "epoch": 0.14479728380267626, "grad_norm": 3.0, "learning_rate": 2.492369179351688e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1450, "tokens_per_second_per_gpu": 2619.86 }, { "epoch": 0.1457958857599361, "grad_norm": 3.5625, "learning_rate": 2.4922605918794532e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1460, "tokens_per_second_per_gpu": 2452.72 }, { "epoch": 0.14679448771719592, "grad_norm": 4.4375, "learning_rate": 2.4921512396487515e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1470, "tokens_per_second_per_gpu": 2671.47 }, { "epoch": 0.14779308967445576, "grad_norm": 3.890625, "learning_rate": 2.4920411227269026e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1480, "tokens_per_second_per_gpu": 2448.97 }, { "epoch": 0.1487916916317156, "grad_norm": 3.75, "learning_rate": 2.4919302411816956e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1490, "tokens_per_second_per_gpu": 2632.79 }, { "epoch": 0.14979029358897544, "grad_norm": 3.640625, "learning_rate": 2.491818595081392e-05, "loss": 0.5488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1500, "tokens_per_second_per_gpu": 2393.86 }, { "epoch": 0.15078889554623526, "grad_norm": 3.671875, "learning_rate": 2.4917061844947233e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1510, "tokens_per_second_per_gpu": 2451.56 }, { "epoch": 0.1517874975034951, "grad_norm": 2.9375, "learning_rate": 2.491593009490891e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1520, "tokens_per_second_per_gpu": 2417.47 }, { "epoch": 0.15278609946075494, "grad_norm": 3.203125, "learning_rate": 2.4914790701395674e-05, "loss": 0.5701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1530, "tokens_per_second_per_gpu": 2478.94 }, { "epoch": 0.15378470141801479, "grad_norm": 3.53125, "learning_rate": 2.4913643665108965e-05, "loss": 0.5149, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1540, "tokens_per_second_per_gpu": 2434.93 }, { "epoch": 0.15478330337527463, "grad_norm": 3.59375, "learning_rate": 2.4912488986754918e-05, "loss": 0.5279, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1550, "tokens_per_second_per_gpu": 2531.95 }, { "epoch": 0.15578190533253444, "grad_norm": 3.375, "learning_rate": 2.4911326667044373e-05, "loss": 0.5147, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1560, "tokens_per_second_per_gpu": 2601.2 }, { "epoch": 0.15678050728979429, "grad_norm": 3.609375, "learning_rate": 2.491015670669287e-05, "loss": 0.5212, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1570, "tokens_per_second_per_gpu": 2487.32 }, { "epoch": 0.15777910924705413, "grad_norm": 3.609375, "learning_rate": 2.4908979106420665e-05, "loss": 0.5706, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1580, "tokens_per_second_per_gpu": 2673.55 }, { "epoch": 0.15877771120431397, "grad_norm": 3.28125, "learning_rate": 2.4907793866952712e-05, "loss": 0.5486, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1590, "tokens_per_second_per_gpu": 2404.47 }, { "epoch": 0.15977631316157379, "grad_norm": 3.390625, "learning_rate": 2.4906600989018657e-05, "loss": 0.5771, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1600, "tokens_per_second_per_gpu": 2514.08 }, { "epoch": 0.16077491511883363, "grad_norm": 3.21875, "learning_rate": 2.4905400473352864e-05, "loss": 0.5454, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1610, "tokens_per_second_per_gpu": 2519.39 }, { "epoch": 0.16177351707609347, "grad_norm": 2.703125, "learning_rate": 2.4904192320694395e-05, "loss": 0.5374, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1620, "tokens_per_second_per_gpu": 2497.08 }, { "epoch": 0.1627721190333533, "grad_norm": 3.421875, "learning_rate": 2.4902976531787003e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1630, "tokens_per_second_per_gpu": 2578.48 }, { "epoch": 0.16377072099061313, "grad_norm": 3.703125, "learning_rate": 2.4901753107379157e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1640, "tokens_per_second_per_gpu": 2447.14 }, { "epoch": 0.16476932294787297, "grad_norm": 3.65625, "learning_rate": 2.490052204822402e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1650, "tokens_per_second_per_gpu": 2496.6 }, { "epoch": 0.1657679249051328, "grad_norm": 2.4375, "learning_rate": 2.489928335507945e-05, "loss": 0.5879, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1660, "tokens_per_second_per_gpu": 2662.82 }, { "epoch": 0.16676652686239266, "grad_norm": 3.1875, "learning_rate": 2.489803702870801e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1670, "tokens_per_second_per_gpu": 2614.24 }, { "epoch": 0.1677651288196525, "grad_norm": 2.9375, "learning_rate": 2.489678306987696e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1680, "tokens_per_second_per_gpu": 2560.69 }, { "epoch": 0.1687637307769123, "grad_norm": 3.015625, "learning_rate": 2.489552147935827e-05, "loss": 0.5447, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1690, "tokens_per_second_per_gpu": 2481.65 }, { "epoch": 0.16976233273417216, "grad_norm": 3.0625, "learning_rate": 2.4894252257928585e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1700, "tokens_per_second_per_gpu": 2515.55 }, { "epoch": 0.170760934691432, "grad_norm": 2.1875, "learning_rate": 2.489297540636927e-05, "loss": 0.4786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1710, "tokens_per_second_per_gpu": 2383.9 }, { "epoch": 0.17175953664869184, "grad_norm": 3.625, "learning_rate": 2.4891690925466372e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1720, "tokens_per_second_per_gpu": 2585.35 }, { "epoch": 0.17275813860595166, "grad_norm": 3.359375, "learning_rate": 2.4890398816010646e-05, "loss": 0.5149, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1730, "tokens_per_second_per_gpu": 2446.56 }, { "epoch": 0.1737567405632115, "grad_norm": 3.984375, "learning_rate": 2.4889099078797536e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1740, "tokens_per_second_per_gpu": 2516.58 }, { "epoch": 0.17475534252047134, "grad_norm": 3.5, "learning_rate": 2.4887791714627187e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1750, "tokens_per_second_per_gpu": 2361.41 }, { "epoch": 0.17575394447773118, "grad_norm": 3.015625, "learning_rate": 2.4886476724304433e-05, "loss": 0.5347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1760, "tokens_per_second_per_gpu": 2347.78 }, { "epoch": 0.17675254643499103, "grad_norm": 3.625, "learning_rate": 2.4885154108638807e-05, "loss": 0.5402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1770, "tokens_per_second_per_gpu": 2549.88 }, { "epoch": 0.17775114839225084, "grad_norm": 3.40625, "learning_rate": 2.4883823868444538e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1780, "tokens_per_second_per_gpu": 2497.43 }, { "epoch": 0.17874975034951068, "grad_norm": 2.984375, "learning_rate": 2.4882486004540547e-05, "loss": 0.5148, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1790, "tokens_per_second_per_gpu": 2737.7 }, { "epoch": 0.17974835230677053, "grad_norm": 3.75, "learning_rate": 2.4881140517750438e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1800, "tokens_per_second_per_gpu": 2394.88 }, { "epoch": 0.18074695426403037, "grad_norm": 2.71875, "learning_rate": 2.4879787408902526e-05, "loss": 0.5466, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1810, "tokens_per_second_per_gpu": 2383.25 }, { "epoch": 0.18174555622129018, "grad_norm": 3.921875, "learning_rate": 2.4878426678829812e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1820, "tokens_per_second_per_gpu": 2401.83 }, { "epoch": 0.18274415817855003, "grad_norm": 3.859375, "learning_rate": 2.4877058328369982e-05, "loss": 0.4986, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1830, "tokens_per_second_per_gpu": 2600.69 }, { "epoch": 0.18374276013580987, "grad_norm": 4.15625, "learning_rate": 2.4875682358365414e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1840, "tokens_per_second_per_gpu": 2398.51 }, { "epoch": 0.1847413620930697, "grad_norm": 3.515625, "learning_rate": 2.487429876966319e-05, "loss": 0.5426, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1850, "tokens_per_second_per_gpu": 2544.1 }, { "epoch": 0.18573996405032953, "grad_norm": 4.1875, "learning_rate": 2.4872907563115065e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1860, "tokens_per_second_per_gpu": 2450.1 }, { "epoch": 0.18673856600758937, "grad_norm": 3.625, "learning_rate": 2.4871508739577493e-05, "loss": 0.5667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1870, "tokens_per_second_per_gpu": 2557.67 }, { "epoch": 0.1877371679648492, "grad_norm": 3.84375, "learning_rate": 2.487010229991162e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1880, "tokens_per_second_per_gpu": 2396.57 }, { "epoch": 0.18873576992210905, "grad_norm": 3.21875, "learning_rate": 2.486868824498327e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1890, "tokens_per_second_per_gpu": 2489.78 }, { "epoch": 0.1897343718793689, "grad_norm": 3.15625, "learning_rate": 2.486726657566296e-05, "loss": 0.5501, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1900, "tokens_per_second_per_gpu": 2461.71 }, { "epoch": 0.1907329738366287, "grad_norm": 2.859375, "learning_rate": 2.48658372928259e-05, "loss": 0.5414, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1910, "tokens_per_second_per_gpu": 2426.67 }, { "epoch": 0.19173157579388855, "grad_norm": 2.828125, "learning_rate": 2.4864400397351987e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1920, "tokens_per_second_per_gpu": 2425.32 }, { "epoch": 0.1927301777511484, "grad_norm": 2.640625, "learning_rate": 2.486295589012579e-05, "loss": 0.5343, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1930, "tokens_per_second_per_gpu": 2601.42 }, { "epoch": 0.19372877970840824, "grad_norm": 4.4375, "learning_rate": 2.4861503772036583e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1940, "tokens_per_second_per_gpu": 2407.81 }, { "epoch": 0.19472738166566805, "grad_norm": 2.75, "learning_rate": 2.4860044043978316e-05, "loss": 0.5227, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1950, "tokens_per_second_per_gpu": 2604.35 }, { "epoch": 0.1957259836229279, "grad_norm": 3.640625, "learning_rate": 2.485857670684962e-05, "loss": 0.5519, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1960, "tokens_per_second_per_gpu": 2780.31 }, { "epoch": 0.19672458558018774, "grad_norm": 4.15625, "learning_rate": 2.485710176155381e-05, "loss": 0.5742, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1970, "tokens_per_second_per_gpu": 2571.81 }, { "epoch": 0.19772318753744758, "grad_norm": 2.375, "learning_rate": 2.4855619208998903e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1980, "tokens_per_second_per_gpu": 2562.79 }, { "epoch": 0.1987217894947074, "grad_norm": 3.0625, "learning_rate": 2.4854129050097573e-05, "loss": 0.5201, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 1990, "tokens_per_second_per_gpu": 2591.12 }, { "epoch": 0.19972039145196724, "grad_norm": 2.953125, "learning_rate": 2.48526312857672e-05, "loss": 0.5301, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2000, "tokens_per_second_per_gpu": 2482.81 }, { "epoch": 0.20071899340922708, "grad_norm": 2.90625, "learning_rate": 2.485112591692983e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2010, "tokens_per_second_per_gpu": 2546.7 }, { "epoch": 0.20171759536648692, "grad_norm": 3.46875, "learning_rate": 2.4849612944512192e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2020, "tokens_per_second_per_gpu": 2591.4 }, { "epoch": 0.20271619732374677, "grad_norm": 3.671875, "learning_rate": 2.4848092369445704e-05, "loss": 0.5653, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2030, "tokens_per_second_per_gpu": 2582.03 }, { "epoch": 0.20371479928100658, "grad_norm": 3.6875, "learning_rate": 2.484656419266646e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2040, "tokens_per_second_per_gpu": 2614.81 }, { "epoch": 0.20471340123826642, "grad_norm": 3.078125, "learning_rate": 2.4845028415115235e-05, "loss": 0.5306, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2050, "tokens_per_second_per_gpu": 2352.58 }, { "epoch": 0.20571200319552627, "grad_norm": 4.25, "learning_rate": 2.4843485037737475e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2060, "tokens_per_second_per_gpu": 2420.53 }, { "epoch": 0.2067106051527861, "grad_norm": 3.25, "learning_rate": 2.4841934061483323e-05, "loss": 0.5581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2070, "tokens_per_second_per_gpu": 2418.01 }, { "epoch": 0.20770920711004592, "grad_norm": 4.0, "learning_rate": 2.4840375487307577e-05, "loss": 0.5648, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2080, "tokens_per_second_per_gpu": 2463.28 }, { "epoch": 0.20870780906730577, "grad_norm": 3.3125, "learning_rate": 2.483880931616973e-05, "loss": 0.5762, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2090, "tokens_per_second_per_gpu": 2549.31 }, { "epoch": 0.2097064110245656, "grad_norm": 3.234375, "learning_rate": 2.4837235549033945e-05, "loss": 0.4791, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2100, "tokens_per_second_per_gpu": 2445.48 }, { "epoch": 0.21070501298182545, "grad_norm": 2.921875, "learning_rate": 2.4835654186869062e-05, "loss": 0.5677, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2110, "tokens_per_second_per_gpu": 2541.8 }, { "epoch": 0.2117036149390853, "grad_norm": 3.09375, "learning_rate": 2.4834065230648597e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2120, "tokens_per_second_per_gpu": 2517.96 }, { "epoch": 0.2127022168963451, "grad_norm": 3.171875, "learning_rate": 2.483246868135074e-05, "loss": 0.5429, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2130, "tokens_per_second_per_gpu": 2587.14 }, { "epoch": 0.21370081885360495, "grad_norm": 3.28125, "learning_rate": 2.4830864539958355e-05, "loss": 0.5612, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2140, "tokens_per_second_per_gpu": 2380.99 }, { "epoch": 0.2146994208108648, "grad_norm": 2.8125, "learning_rate": 2.482925280745898e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2150, "tokens_per_second_per_gpu": 2521.96 }, { "epoch": 0.21569802276812464, "grad_norm": 3.46875, "learning_rate": 2.482763348484483e-05, "loss": 0.5553, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2160, "tokens_per_second_per_gpu": 2334.48 }, { "epoch": 0.21669662472538445, "grad_norm": 3.765625, "learning_rate": 2.482600657311279e-05, "loss": 0.5549, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2170, "tokens_per_second_per_gpu": 2617.39 }, { "epoch": 0.2176952266826443, "grad_norm": 3.546875, "learning_rate": 2.482437207326442e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2180, "tokens_per_second_per_gpu": 2305.99 }, { "epoch": 0.21869382863990414, "grad_norm": 3.546875, "learning_rate": 2.4822729986305938e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2190, "tokens_per_second_per_gpu": 2493.51 }, { "epoch": 0.21969243059716398, "grad_norm": 4.0, "learning_rate": 2.482108031324825e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2200, "tokens_per_second_per_gpu": 2504.39 }, { "epoch": 0.2206910325544238, "grad_norm": 3.8125, "learning_rate": 2.4819423055106925e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2210, "tokens_per_second_per_gpu": 2556.14 }, { "epoch": 0.22168963451168364, "grad_norm": 3.5625, "learning_rate": 2.48177582129022e-05, "loss": 0.5668, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2220, "tokens_per_second_per_gpu": 2562.64 }, { "epoch": 0.22268823646894348, "grad_norm": 3.515625, "learning_rate": 2.4816085787658984e-05, "loss": 0.5064, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2230, "tokens_per_second_per_gpu": 2531.2 }, { "epoch": 0.22368683842620332, "grad_norm": 2.375, "learning_rate": 2.4814405780406848e-05, "loss": 0.5623, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2240, "tokens_per_second_per_gpu": 2504.99 }, { "epoch": 0.22468544038346316, "grad_norm": 3.34375, "learning_rate": 2.4812718192180042e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2250, "tokens_per_second_per_gpu": 2426.11 }, { "epoch": 0.22568404234072298, "grad_norm": 3.546875, "learning_rate": 2.4811023024017473e-05, "loss": 0.5582, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2260, "tokens_per_second_per_gpu": 2468.31 }, { "epoch": 0.22668264429798282, "grad_norm": 2.984375, "learning_rate": 2.4809320276962722e-05, "loss": 0.5309, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2270, "tokens_per_second_per_gpu": 2504.65 }, { "epoch": 0.22768124625524266, "grad_norm": 2.5, "learning_rate": 2.480760995206402e-05, "loss": 0.5235, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2280, "tokens_per_second_per_gpu": 2360.38 }, { "epoch": 0.2286798482125025, "grad_norm": 2.84375, "learning_rate": 2.4805892050374287e-05, "loss": 0.5618, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2290, "tokens_per_second_per_gpu": 2366.07 }, { "epoch": 0.22967845016976232, "grad_norm": 3.34375, "learning_rate": 2.480416657295109e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2300, "tokens_per_second_per_gpu": 2477.47 }, { "epoch": 0.23067705212702216, "grad_norm": 3.234375, "learning_rate": 2.480243352085666e-05, "loss": 0.5606, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2310, "tokens_per_second_per_gpu": 2415.0 }, { "epoch": 0.231675654084282, "grad_norm": 3.03125, "learning_rate": 2.4800692895157906e-05, "loss": 0.5314, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2320, "tokens_per_second_per_gpu": 2506.95 }, { "epoch": 0.23267425604154185, "grad_norm": 3.171875, "learning_rate": 2.479894469692638e-05, "loss": 0.5777, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2330, "tokens_per_second_per_gpu": 2500.61 }, { "epoch": 0.23367285799880166, "grad_norm": 4.78125, "learning_rate": 2.479718892723831e-05, "loss": 0.6, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2340, "tokens_per_second_per_gpu": 2536.16 }, { "epoch": 0.2346714599560615, "grad_norm": 4.28125, "learning_rate": 2.4795425587174574e-05, "loss": 0.5407, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2350, "tokens_per_second_per_gpu": 2492.73 }, { "epoch": 0.23567006191332135, "grad_norm": 3.21875, "learning_rate": 2.479365467782073e-05, "loss": 0.5298, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2360, "tokens_per_second_per_gpu": 2617.39 }, { "epoch": 0.2366686638705812, "grad_norm": 2.859375, "learning_rate": 2.4791876200266968e-05, "loss": 0.5561, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2370, "tokens_per_second_per_gpu": 2614.95 }, { "epoch": 0.23766726582784103, "grad_norm": 4.15625, "learning_rate": 2.4790090155608156e-05, "loss": 0.5084, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2380, "tokens_per_second_per_gpu": 2559.15 }, { "epoch": 0.23866586778510085, "grad_norm": 3.4375, "learning_rate": 2.4788296544943817e-05, "loss": 0.5153, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2390, "tokens_per_second_per_gpu": 2507.4 }, { "epoch": 0.2396644697423607, "grad_norm": 3.453125, "learning_rate": 2.4786495369378133e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2400, "tokens_per_second_per_gpu": 2505.49 }, { "epoch": 0.24066307169962053, "grad_norm": 2.625, "learning_rate": 2.4784686630019937e-05, "loss": 0.5533, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2410, "tokens_per_second_per_gpu": 2484.77 }, { "epoch": 0.24166167365688038, "grad_norm": 2.875, "learning_rate": 2.4782870327982725e-05, "loss": 0.5331, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2420, "tokens_per_second_per_gpu": 2469.52 }, { "epoch": 0.2426602756141402, "grad_norm": 2.828125, "learning_rate": 2.4781046464384642e-05, "loss": 0.5613, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2430, "tokens_per_second_per_gpu": 2516.75 }, { "epoch": 0.24365887757140003, "grad_norm": 3.21875, "learning_rate": 2.47792150403485e-05, "loss": 0.5527, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2440, "tokens_per_second_per_gpu": 2431.34 }, { "epoch": 0.24465747952865988, "grad_norm": 2.921875, "learning_rate": 2.4777376057001745e-05, "loss": 0.581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2450, "tokens_per_second_per_gpu": 2526.71 }, { "epoch": 0.24565608148591972, "grad_norm": 2.609375, "learning_rate": 2.47755295154765e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2460, "tokens_per_second_per_gpu": 2585.06 }, { "epoch": 0.24665468344317956, "grad_norm": 2.515625, "learning_rate": 2.477367541690952e-05, "loss": 0.5701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2470, "tokens_per_second_per_gpu": 2515.99 }, { "epoch": 0.24765328540043938, "grad_norm": 3.171875, "learning_rate": 2.4771813762442235e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2480, "tokens_per_second_per_gpu": 2562.7 }, { "epoch": 0.24865188735769922, "grad_norm": 3.640625, "learning_rate": 2.4769944553220703e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2490, "tokens_per_second_per_gpu": 2521.39 }, { "epoch": 0.24965048931495906, "grad_norm": 3.375, "learning_rate": 2.4768067790395646e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2500, "tokens_per_second_per_gpu": 2517.88 }, { "epoch": 0.2506490912722189, "grad_norm": 3.984375, "learning_rate": 2.476618347512244e-05, "loss": 0.569, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2510, "tokens_per_second_per_gpu": 2493.93 }, { "epoch": 0.2516476932294787, "grad_norm": 3.265625, "learning_rate": 2.4764291608561095e-05, "loss": 0.5648, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2520, "tokens_per_second_per_gpu": 2446.04 }, { "epoch": 0.2526462951867386, "grad_norm": 3.390625, "learning_rate": 2.4762392191876283e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2530, "tokens_per_second_per_gpu": 2451.36 }, { "epoch": 0.2536448971439984, "grad_norm": 3.953125, "learning_rate": 2.476048522623732e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2540, "tokens_per_second_per_gpu": 2344.68 }, { "epoch": 0.2546434991012582, "grad_norm": 3.1875, "learning_rate": 2.475857071281817e-05, "loss": 0.61, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2550, "tokens_per_second_per_gpu": 2457.85 }, { "epoch": 0.2556421010585181, "grad_norm": 4.4375, "learning_rate": 2.475664865279744e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2560, "tokens_per_second_per_gpu": 2481.59 }, { "epoch": 0.2566407030157779, "grad_norm": 4.21875, "learning_rate": 2.475471904735839e-05, "loss": 0.681, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2570, "tokens_per_second_per_gpu": 2360.86 }, { "epoch": 0.2576393049730378, "grad_norm": 4.46875, "learning_rate": 2.4752781897688915e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2580, "tokens_per_second_per_gpu": 2472.65 }, { "epoch": 0.2586379069302976, "grad_norm": 2.53125, "learning_rate": 2.4750837204981565e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2590, "tokens_per_second_per_gpu": 2493.07 }, { "epoch": 0.2596365088875574, "grad_norm": 2.796875, "learning_rate": 2.4748884970433524e-05, "loss": 0.4886, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2600, "tokens_per_second_per_gpu": 2411.8 }, { "epoch": 0.2606351108448173, "grad_norm": 2.59375, "learning_rate": 2.4746925195246624e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2610, "tokens_per_second_per_gpu": 2517.37 }, { "epoch": 0.2616337128020771, "grad_norm": 3.265625, "learning_rate": 2.474495788062734e-05, "loss": 0.579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2620, "tokens_per_second_per_gpu": 2522.44 }, { "epoch": 0.2626323147593369, "grad_norm": 2.75, "learning_rate": 2.4742983027786794e-05, "loss": 0.5023, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2630, "tokens_per_second_per_gpu": 2416.93 }, { "epoch": 0.2636309167165968, "grad_norm": 3.28125, "learning_rate": 2.4741000637940732e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2640, "tokens_per_second_per_gpu": 2394.91 }, { "epoch": 0.2646295186738566, "grad_norm": 3.15625, "learning_rate": 2.473901071230955e-05, "loss": 0.5638, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2650, "tokens_per_second_per_gpu": 2549.4 }, { "epoch": 0.26562812063111646, "grad_norm": 2.703125, "learning_rate": 2.4737013252118285e-05, "loss": 0.5516, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2660, "tokens_per_second_per_gpu": 2653.01 }, { "epoch": 0.2666267225883763, "grad_norm": 3.515625, "learning_rate": 2.4735008258596616e-05, "loss": 0.579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2670, "tokens_per_second_per_gpu": 2433.46 }, { "epoch": 0.2676253245456361, "grad_norm": 4.03125, "learning_rate": 2.4732995732978844e-05, "loss": 0.5879, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2680, "tokens_per_second_per_gpu": 2442.08 }, { "epoch": 0.26862392650289596, "grad_norm": 3.625, "learning_rate": 2.4730975676503926e-05, "loss": 0.5778, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2690, "tokens_per_second_per_gpu": 2478.57 }, { "epoch": 0.2696225284601558, "grad_norm": 3.578125, "learning_rate": 2.472894809041544e-05, "loss": 0.5456, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2700, "tokens_per_second_per_gpu": 2513.98 }, { "epoch": 0.27062113041741565, "grad_norm": 2.78125, "learning_rate": 2.4726912975961602e-05, "loss": 0.5063, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2710, "tokens_per_second_per_gpu": 2464.05 }, { "epoch": 0.27161973237467546, "grad_norm": 2.953125, "learning_rate": 2.4724870334395274e-05, "loss": 0.5402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2720, "tokens_per_second_per_gpu": 2574.62 }, { "epoch": 0.2726183343319353, "grad_norm": 3.0625, "learning_rate": 2.4722820166973943e-05, "loss": 0.5773, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2730, "tokens_per_second_per_gpu": 2676.57 }, { "epoch": 0.27361693628919515, "grad_norm": 3.265625, "learning_rate": 2.472076247495972e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2740, "tokens_per_second_per_gpu": 2632.42 }, { "epoch": 0.27461553824645496, "grad_norm": 3.8125, "learning_rate": 2.4718697259619372e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2750, "tokens_per_second_per_gpu": 2505.39 }, { "epoch": 0.2756141402037148, "grad_norm": 4.0, "learning_rate": 2.4716624522224274e-05, "loss": 0.5692, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2760, "tokens_per_second_per_gpu": 2358.23 }, { "epoch": 0.27661274216097465, "grad_norm": 2.859375, "learning_rate": 2.471454426405044e-05, "loss": 0.5161, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2770, "tokens_per_second_per_gpu": 2587.35 }, { "epoch": 0.27761134411823446, "grad_norm": 3.015625, "learning_rate": 2.4712456486378526e-05, "loss": 0.4979, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2780, "tokens_per_second_per_gpu": 2474.05 }, { "epoch": 0.27860994607549433, "grad_norm": 3.5, "learning_rate": 2.4710361190493796e-05, "loss": 0.5293, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2790, "tokens_per_second_per_gpu": 2613.3 }, { "epoch": 0.27960854803275415, "grad_norm": 3.65625, "learning_rate": 2.4708258377686157e-05, "loss": 0.5265, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2800, "tokens_per_second_per_gpu": 2237.67 }, { "epoch": 0.28060714999001396, "grad_norm": 3.640625, "learning_rate": 2.470614804925014e-05, "loss": 0.5403, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2810, "tokens_per_second_per_gpu": 2387.78 }, { "epoch": 0.28160575194727383, "grad_norm": 3.390625, "learning_rate": 2.4704030206484896e-05, "loss": 0.5103, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2820, "tokens_per_second_per_gpu": 2400.98 }, { "epoch": 0.28260435390453365, "grad_norm": 2.90625, "learning_rate": 2.4701904850694216e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2830, "tokens_per_second_per_gpu": 2402.21 }, { "epoch": 0.2836029558617935, "grad_norm": 3.25, "learning_rate": 2.4699771983186505e-05, "loss": 0.5662, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2840, "tokens_per_second_per_gpu": 2212.44 }, { "epoch": 0.28460155781905333, "grad_norm": 3.0625, "learning_rate": 2.469763160527479e-05, "loss": 0.529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2850, "tokens_per_second_per_gpu": 2409.89 }, { "epoch": 0.28560015977631314, "grad_norm": 3.40625, "learning_rate": 2.4695483718276734e-05, "loss": 0.4944, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2860, "tokens_per_second_per_gpu": 2545.09 }, { "epoch": 0.286598761733573, "grad_norm": 2.921875, "learning_rate": 2.4693328323514615e-05, "loss": 0.5462, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2870, "tokens_per_second_per_gpu": 2623.18 }, { "epoch": 0.28759736369083283, "grad_norm": 3.546875, "learning_rate": 2.4691165422315333e-05, "loss": 0.5415, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2880, "tokens_per_second_per_gpu": 2528.76 }, { "epoch": 0.28859596564809264, "grad_norm": 2.671875, "learning_rate": 2.4688995016010407e-05, "loss": 0.5014, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2890, "tokens_per_second_per_gpu": 2326.85 }, { "epoch": 0.2895945676053525, "grad_norm": 4.28125, "learning_rate": 2.4686817105935983e-05, "loss": 0.628, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2900, "tokens_per_second_per_gpu": 2523.03 }, { "epoch": 0.29059316956261233, "grad_norm": 3.640625, "learning_rate": 2.4684631693432818e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2910, "tokens_per_second_per_gpu": 2497.34 }, { "epoch": 0.2915917715198722, "grad_norm": 4.125, "learning_rate": 2.4682438779846294e-05, "loss": 0.56, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2920, "tokens_per_second_per_gpu": 2362.11 }, { "epoch": 0.292590373477132, "grad_norm": 2.875, "learning_rate": 2.4680238366526415e-05, "loss": 0.5157, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2930, "tokens_per_second_per_gpu": 2361.71 }, { "epoch": 0.29358897543439183, "grad_norm": 3.421875, "learning_rate": 2.4678030454827787e-05, "loss": 0.5326, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2940, "tokens_per_second_per_gpu": 2318.25 }, { "epoch": 0.2945875773916517, "grad_norm": 3.84375, "learning_rate": 2.4675815046109644e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2950, "tokens_per_second_per_gpu": 2476.4 }, { "epoch": 0.2955861793489115, "grad_norm": 3.671875, "learning_rate": 2.4673592141735834e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2960, "tokens_per_second_per_gpu": 2447.05 }, { "epoch": 0.2965847813061714, "grad_norm": 3.375, "learning_rate": 2.4671361743074817e-05, "loss": 0.5322, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2970, "tokens_per_second_per_gpu": 2349.01 }, { "epoch": 0.2975833832634312, "grad_norm": 2.75, "learning_rate": 2.4669123851499664e-05, "loss": 0.5738, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2980, "tokens_per_second_per_gpu": 2490.05 }, { "epoch": 0.298581985220691, "grad_norm": 2.453125, "learning_rate": 2.466687846838807e-05, "loss": 0.5614, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 2990, "tokens_per_second_per_gpu": 2518.59 }, { "epoch": 0.2995805871779509, "grad_norm": 2.984375, "learning_rate": 2.4664625595122326e-05, "loss": 0.4831, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3000, "tokens_per_second_per_gpu": 2548.96 }, { "epoch": 0.3005791891352107, "grad_norm": 3.203125, "learning_rate": 2.4662365233089346e-05, "loss": 0.5516, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3010, "tokens_per_second_per_gpu": 2410.0 }, { "epoch": 0.3015777910924705, "grad_norm": 3.75, "learning_rate": 2.4660097383680653e-05, "loss": 0.5786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3020, "tokens_per_second_per_gpu": 2585.21 }, { "epoch": 0.3025763930497304, "grad_norm": 2.8125, "learning_rate": 2.4657822048292374e-05, "loss": 0.5573, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3030, "tokens_per_second_per_gpu": 2489.88 }, { "epoch": 0.3035749950069902, "grad_norm": 4.28125, "learning_rate": 2.4655539228325244e-05, "loss": 0.5643, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3040, "tokens_per_second_per_gpu": 2352.79 }, { "epoch": 0.30457359696425007, "grad_norm": 2.5625, "learning_rate": 2.4653248925184613e-05, "loss": 0.557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3050, "tokens_per_second_per_gpu": 2514.11 }, { "epoch": 0.3055721989215099, "grad_norm": 3.0625, "learning_rate": 2.4650951140280438e-05, "loss": 0.5692, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3060, "tokens_per_second_per_gpu": 2443.75 }, { "epoch": 0.3065708008787697, "grad_norm": 3.578125, "learning_rate": 2.4648645875027264e-05, "loss": 0.5302, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3070, "tokens_per_second_per_gpu": 2522.86 }, { "epoch": 0.30756940283602957, "grad_norm": 3.8125, "learning_rate": 2.4646333130844268e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3080, "tokens_per_second_per_gpu": 2471.63 }, { "epoch": 0.3085680047932894, "grad_norm": 3.625, "learning_rate": 2.4644012909155214e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3090, "tokens_per_second_per_gpu": 2333.61 }, { "epoch": 0.30956660675054926, "grad_norm": 2.59375, "learning_rate": 2.464168521138847e-05, "loss": 0.5587, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3100, "tokens_per_second_per_gpu": 2481.82 }, { "epoch": 0.31056520870780907, "grad_norm": 2.84375, "learning_rate": 2.4639350038977017e-05, "loss": 0.5658, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3110, "tokens_per_second_per_gpu": 2511.81 }, { "epoch": 0.3115638106650689, "grad_norm": 4.4375, "learning_rate": 2.463700739335842e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3120, "tokens_per_second_per_gpu": 2457.39 }, { "epoch": 0.31256241262232876, "grad_norm": 2.609375, "learning_rate": 2.463465727597486e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3130, "tokens_per_second_per_gpu": 2327.09 }, { "epoch": 0.31356101457958857, "grad_norm": 2.609375, "learning_rate": 2.4632299688273113e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3140, "tokens_per_second_per_gpu": 2621.31 }, { "epoch": 0.3145596165368484, "grad_norm": 3.9375, "learning_rate": 2.4629934631704554e-05, "loss": 0.5405, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3150, "tokens_per_second_per_gpu": 2577.83 }, { "epoch": 0.31555821849410826, "grad_norm": 2.625, "learning_rate": 2.462756210772515e-05, "loss": 0.5581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3160, "tokens_per_second_per_gpu": 2464.7 }, { "epoch": 0.31655682045136807, "grad_norm": 2.96875, "learning_rate": 2.462518211779548e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3170, "tokens_per_second_per_gpu": 2631.32 }, { "epoch": 0.31755542240862794, "grad_norm": 3.625, "learning_rate": 2.4622794663380698e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3180, "tokens_per_second_per_gpu": 2417.53 }, { "epoch": 0.31855402436588776, "grad_norm": 3.578125, "learning_rate": 2.4620399745950573e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3190, "tokens_per_second_per_gpu": 2398.46 }, { "epoch": 0.31955262632314757, "grad_norm": 3.390625, "learning_rate": 2.461799736697946e-05, "loss": 0.5619, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3200, "tokens_per_second_per_gpu": 2374.25 }, { "epoch": 0.32055122828040744, "grad_norm": 2.546875, "learning_rate": 2.46155875279463e-05, "loss": 0.5456, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3210, "tokens_per_second_per_gpu": 2546.56 }, { "epoch": 0.32154983023766726, "grad_norm": 4.03125, "learning_rate": 2.4613170230334647e-05, "loss": 0.546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3220, "tokens_per_second_per_gpu": 2627.21 }, { "epoch": 0.3225484321949271, "grad_norm": 3.5625, "learning_rate": 2.461074547563262e-05, "loss": 0.5568, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3230, "tokens_per_second_per_gpu": 2561.46 }, { "epoch": 0.32354703415218694, "grad_norm": 3.265625, "learning_rate": 2.4608313265332954e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3240, "tokens_per_second_per_gpu": 2709.13 }, { "epoch": 0.32454563610944676, "grad_norm": 2.71875, "learning_rate": 2.4605873600932956e-05, "loss": 0.5327, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3250, "tokens_per_second_per_gpu": 2512.87 }, { "epoch": 0.3255442380667066, "grad_norm": 3.734375, "learning_rate": 2.4603426483934527e-05, "loss": 0.606, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3260, "tokens_per_second_per_gpu": 2635.98 }, { "epoch": 0.32654284002396644, "grad_norm": 3.4375, "learning_rate": 2.4600971915844164e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3270, "tokens_per_second_per_gpu": 2532.57 }, { "epoch": 0.32754144198122626, "grad_norm": 2.609375, "learning_rate": 2.459850989817294e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3280, "tokens_per_second_per_gpu": 2443.84 }, { "epoch": 0.3285400439384861, "grad_norm": 2.3125, "learning_rate": 2.4596040432436516e-05, "loss": 0.5303, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3290, "tokens_per_second_per_gpu": 2570.41 }, { "epoch": 0.32953864589574594, "grad_norm": 2.890625, "learning_rate": 2.4593563520155144e-05, "loss": 0.5508, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3300, "tokens_per_second_per_gpu": 2198.31 }, { "epoch": 0.3305372478530058, "grad_norm": 3.015625, "learning_rate": 2.459107916285366e-05, "loss": 0.5428, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3310, "tokens_per_second_per_gpu": 2444.5 }, { "epoch": 0.3315358498102656, "grad_norm": 3.265625, "learning_rate": 2.458858736206147e-05, "loss": 0.5136, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3320, "tokens_per_second_per_gpu": 2268.29 }, { "epoch": 0.33253445176752544, "grad_norm": 3.46875, "learning_rate": 2.4586088119312582e-05, "loss": 0.5684, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3330, "tokens_per_second_per_gpu": 2519.81 }, { "epoch": 0.3335330537247853, "grad_norm": 2.796875, "learning_rate": 2.458358143614557e-05, "loss": 0.5652, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3340, "tokens_per_second_per_gpu": 2499.16 }, { "epoch": 0.3345316556820451, "grad_norm": 3.171875, "learning_rate": 2.4581067314103602e-05, "loss": 0.5297, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3350, "tokens_per_second_per_gpu": 2536.3 }, { "epoch": 0.335530257639305, "grad_norm": 3.0625, "learning_rate": 2.4578545754734405e-05, "loss": 0.4888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3360, "tokens_per_second_per_gpu": 2358.22 }, { "epoch": 0.3365288595965648, "grad_norm": 2.53125, "learning_rate": 2.4576016759590308e-05, "loss": 0.5815, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3370, "tokens_per_second_per_gpu": 2465.07 }, { "epoch": 0.3375274615538246, "grad_norm": 3.328125, "learning_rate": 2.4573480330228205e-05, "loss": 0.5625, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3380, "tokens_per_second_per_gpu": 2604.7 }, { "epoch": 0.3385260635110845, "grad_norm": 3.171875, "learning_rate": 2.4570936468209565e-05, "loss": 0.5208, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3390, "tokens_per_second_per_gpu": 2298.91 }, { "epoch": 0.3395246654683443, "grad_norm": 3.796875, "learning_rate": 2.456838517510044e-05, "loss": 0.5497, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3400, "tokens_per_second_per_gpu": 2480.21 }, { "epoch": 0.3405232674256042, "grad_norm": 3.140625, "learning_rate": 2.4565826452471447e-05, "loss": 0.5188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3410, "tokens_per_second_per_gpu": 2538.9 }, { "epoch": 0.341521869382864, "grad_norm": 3.578125, "learning_rate": 2.456326030189779e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3420, "tokens_per_second_per_gpu": 2385.96 }, { "epoch": 0.3425204713401238, "grad_norm": 2.5625, "learning_rate": 2.4560686724959235e-05, "loss": 0.5191, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3430, "tokens_per_second_per_gpu": 2397.06 }, { "epoch": 0.3435190732973837, "grad_norm": 2.578125, "learning_rate": 2.4558105723240127e-05, "loss": 0.5515, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3440, "tokens_per_second_per_gpu": 2525.31 }, { "epoch": 0.3445176752546435, "grad_norm": 2.96875, "learning_rate": 2.4555517298329373e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3450, "tokens_per_second_per_gpu": 2382.04 }, { "epoch": 0.3455162772119033, "grad_norm": 3.015625, "learning_rate": 2.4552921451820453e-05, "loss": 0.5418, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3460, "tokens_per_second_per_gpu": 2343.09 }, { "epoch": 0.3465148791691632, "grad_norm": 3.140625, "learning_rate": 2.4550318185311426e-05, "loss": 0.5588, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3470, "tokens_per_second_per_gpu": 2374.68 }, { "epoch": 0.347513481126423, "grad_norm": 3.09375, "learning_rate": 2.4547707500404905e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3480, "tokens_per_second_per_gpu": 2373.24 }, { "epoch": 0.34851208308368287, "grad_norm": 2.640625, "learning_rate": 2.4545089398708078e-05, "loss": 0.5147, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3490, "tokens_per_second_per_gpu": 2367.93 }, { "epoch": 0.3495106850409427, "grad_norm": 3.609375, "learning_rate": 2.4542463881832696e-05, "loss": 0.5253, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3500, "tokens_per_second_per_gpu": 2692.22 }, { "epoch": 0.3505092869982025, "grad_norm": 2.640625, "learning_rate": 2.453983095139507e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3510, "tokens_per_second_per_gpu": 2574.64 }, { "epoch": 0.35150788895546237, "grad_norm": 2.953125, "learning_rate": 2.453719060901609e-05, "loss": 0.563, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3520, "tokens_per_second_per_gpu": 2438.66 }, { "epoch": 0.3525064909127222, "grad_norm": 3.140625, "learning_rate": 2.4534542856321197e-05, "loss": 0.5872, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3530, "tokens_per_second_per_gpu": 2399.74 }, { "epoch": 0.35350509286998205, "grad_norm": 3.609375, "learning_rate": 2.453188769494039e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3540, "tokens_per_second_per_gpu": 2545.87 }, { "epoch": 0.35450369482724187, "grad_norm": 3.53125, "learning_rate": 2.4529225126508244e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3550, "tokens_per_second_per_gpu": 2504.27 }, { "epoch": 0.3555022967845017, "grad_norm": 2.828125, "learning_rate": 2.4526555152663876e-05, "loss": 0.5584, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3560, "tokens_per_second_per_gpu": 2573.94 }, { "epoch": 0.35650089874176155, "grad_norm": 3.515625, "learning_rate": 2.4523877775050974e-05, "loss": 0.5686, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3570, "tokens_per_second_per_gpu": 2386.54 }, { "epoch": 0.35749950069902137, "grad_norm": 3.5, "learning_rate": 2.4521192995317784e-05, "loss": 0.5596, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3580, "tokens_per_second_per_gpu": 2471.82 }, { "epoch": 0.3584981026562812, "grad_norm": 2.96875, "learning_rate": 2.4518500815117107e-05, "loss": 0.5635, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3590, "tokens_per_second_per_gpu": 2419.2 }, { "epoch": 0.35949670461354105, "grad_norm": 2.90625, "learning_rate": 2.4515801236106297e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3600, "tokens_per_second_per_gpu": 2501.18 }, { "epoch": 0.36049530657080087, "grad_norm": 3.203125, "learning_rate": 2.4513094259947257e-05, "loss": 0.5698, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3610, "tokens_per_second_per_gpu": 2637.18 }, { "epoch": 0.36149390852806074, "grad_norm": 3.578125, "learning_rate": 2.4510379888306458e-05, "loss": 0.5449, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3620, "tokens_per_second_per_gpu": 2418.55 }, { "epoch": 0.36249251048532055, "grad_norm": 3.90625, "learning_rate": 2.450765812285492e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3630, "tokens_per_second_per_gpu": 2663.24 }, { "epoch": 0.36349111244258037, "grad_norm": 3.6875, "learning_rate": 2.4504928965268206e-05, "loss": 0.5551, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3640, "tokens_per_second_per_gpu": 2536.7 }, { "epoch": 0.36448971439984024, "grad_norm": 3.265625, "learning_rate": 2.450219241722644e-05, "loss": 0.5185, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3650, "tokens_per_second_per_gpu": 2353.01 }, { "epoch": 0.36548831635710005, "grad_norm": 3.1875, "learning_rate": 2.4499448480414288e-05, "loss": 0.5041, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3660, "tokens_per_second_per_gpu": 2539.95 }, { "epoch": 0.3664869183143599, "grad_norm": 3.25, "learning_rate": 2.449669715652097e-05, "loss": 0.5492, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3670, "tokens_per_second_per_gpu": 2663.58 }, { "epoch": 0.36748552027161974, "grad_norm": 2.359375, "learning_rate": 2.449393844724025e-05, "loss": 0.4435, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3680, "tokens_per_second_per_gpu": 2529.78 }, { "epoch": 0.36848412222887955, "grad_norm": 3.46875, "learning_rate": 2.4491172354270443e-05, "loss": 0.4711, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3690, "tokens_per_second_per_gpu": 2588.91 }, { "epoch": 0.3694827241861394, "grad_norm": 2.96875, "learning_rate": 2.4488398879314405e-05, "loss": 0.5515, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3700, "tokens_per_second_per_gpu": 2383.03 }, { "epoch": 0.37048132614339924, "grad_norm": 2.765625, "learning_rate": 2.4485618024079538e-05, "loss": 0.5084, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3710, "tokens_per_second_per_gpu": 2404.34 }, { "epoch": 0.37147992810065905, "grad_norm": 2.96875, "learning_rate": 2.4482829790277784e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3720, "tokens_per_second_per_gpu": 2450.42 }, { "epoch": 0.3724785300579189, "grad_norm": 3.203125, "learning_rate": 2.4480034179625638e-05, "loss": 0.5742, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3730, "tokens_per_second_per_gpu": 2284.42 }, { "epoch": 0.37347713201517874, "grad_norm": 3.265625, "learning_rate": 2.4477231193844125e-05, "loss": 0.5531, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3740, "tokens_per_second_per_gpu": 2592.06 }, { "epoch": 0.3744757339724386, "grad_norm": 2.453125, "learning_rate": 2.4474420834658814e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3750, "tokens_per_second_per_gpu": 2480.02 }, { "epoch": 0.3754743359296984, "grad_norm": 2.78125, "learning_rate": 2.4471603103799818e-05, "loss": 0.5261, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3760, "tokens_per_second_per_gpu": 2634.12 }, { "epoch": 0.37647293788695824, "grad_norm": 3.328125, "learning_rate": 2.446877800300178e-05, "loss": 0.623, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3770, "tokens_per_second_per_gpu": 2470.14 }, { "epoch": 0.3774715398442181, "grad_norm": 2.96875, "learning_rate": 2.4465945534003887e-05, "loss": 0.5036, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3780, "tokens_per_second_per_gpu": 2491.46 }, { "epoch": 0.3784701418014779, "grad_norm": 2.890625, "learning_rate": 2.4463105698549854e-05, "loss": 0.5372, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3790, "tokens_per_second_per_gpu": 2375.22 }, { "epoch": 0.3794687437587378, "grad_norm": 2.5625, "learning_rate": 2.446025849838794e-05, "loss": 0.5727, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3800, "tokens_per_second_per_gpu": 2521.78 }, { "epoch": 0.3804673457159976, "grad_norm": 2.84375, "learning_rate": 2.4457403935270933e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3810, "tokens_per_second_per_gpu": 2429.02 }, { "epoch": 0.3814659476732574, "grad_norm": 3.90625, "learning_rate": 2.4454542010956157e-05, "loss": 0.5229, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3820, "tokens_per_second_per_gpu": 2310.42 }, { "epoch": 0.3824645496305173, "grad_norm": 3.71875, "learning_rate": 2.4451672727205465e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3830, "tokens_per_second_per_gpu": 2553.33 }, { "epoch": 0.3834631515877771, "grad_norm": 3.890625, "learning_rate": 2.4448796085785235e-05, "loss": 0.513, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3840, "tokens_per_second_per_gpu": 2377.86 }, { "epoch": 0.3844617535450369, "grad_norm": 3.21875, "learning_rate": 2.4445912088466383e-05, "loss": 0.55, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3850, "tokens_per_second_per_gpu": 2443.88 }, { "epoch": 0.3854603555022968, "grad_norm": 2.46875, "learning_rate": 2.444302073702435e-05, "loss": 0.5702, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3860, "tokens_per_second_per_gpu": 2648.06 }, { "epoch": 0.3864589574595566, "grad_norm": 3.46875, "learning_rate": 2.444012203323911e-05, "loss": 0.65, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3870, "tokens_per_second_per_gpu": 2429.26 }, { "epoch": 0.3874575594168165, "grad_norm": 3.5, "learning_rate": 2.4437215978895155e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3880, "tokens_per_second_per_gpu": 2426.38 }, { "epoch": 0.3884561613740763, "grad_norm": 3.453125, "learning_rate": 2.4434302575781508e-05, "loss": 0.487, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3890, "tokens_per_second_per_gpu": 2520.43 }, { "epoch": 0.3894547633313361, "grad_norm": 3.46875, "learning_rate": 2.4431381825691707e-05, "loss": 0.5012, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3900, "tokens_per_second_per_gpu": 2466.26 }, { "epoch": 0.390453365288596, "grad_norm": 2.328125, "learning_rate": 2.442845373042383e-05, "loss": 0.5715, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3910, "tokens_per_second_per_gpu": 2606.85 }, { "epoch": 0.3914519672458558, "grad_norm": 3.40625, "learning_rate": 2.4425518291780452e-05, "loss": 0.5278, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3920, "tokens_per_second_per_gpu": 2637.98 }, { "epoch": 0.39245056920311566, "grad_norm": 3.65625, "learning_rate": 2.4422575511568697e-05, "loss": 0.5401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3930, "tokens_per_second_per_gpu": 2521.92 }, { "epoch": 0.3934491711603755, "grad_norm": 2.75, "learning_rate": 2.441962539160019e-05, "loss": 0.4578, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3940, "tokens_per_second_per_gpu": 2370.27 }, { "epoch": 0.3944477731176353, "grad_norm": 3.6875, "learning_rate": 2.4416667933691075e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3950, "tokens_per_second_per_gpu": 2578.85 }, { "epoch": 0.39544637507489516, "grad_norm": 3.3125, "learning_rate": 2.4413703139662016e-05, "loss": 0.5504, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3960, "tokens_per_second_per_gpu": 2464.95 }, { "epoch": 0.396444977032155, "grad_norm": 3.609375, "learning_rate": 2.4410731011338205e-05, "loss": 0.5592, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3970, "tokens_per_second_per_gpu": 2423.95 }, { "epoch": 0.3974435789894148, "grad_norm": 3.59375, "learning_rate": 2.4407751550549328e-05, "loss": 0.5736, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3980, "tokens_per_second_per_gpu": 2372.9 }, { "epoch": 0.39844218094667466, "grad_norm": 3.5625, "learning_rate": 2.4404764759129605e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 3990, "tokens_per_second_per_gpu": 2473.87 }, { "epoch": 0.3994407829039345, "grad_norm": 2.53125, "learning_rate": 2.4401770638917754e-05, "loss": 0.5468, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4000, "tokens_per_second_per_gpu": 2597.7 }, { "epoch": 0.40043938486119435, "grad_norm": 3.28125, "learning_rate": 2.4398769191757013e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4010, "tokens_per_second_per_gpu": 2362.27 }, { "epoch": 0.40143798681845416, "grad_norm": 4.34375, "learning_rate": 2.4395760419495125e-05, "loss": 0.5754, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4020, "tokens_per_second_per_gpu": 2248.82 }, { "epoch": 0.402436588775714, "grad_norm": 2.78125, "learning_rate": 2.439274432398435e-05, "loss": 0.5514, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4030, "tokens_per_second_per_gpu": 2507.58 }, { "epoch": 0.40343519073297385, "grad_norm": 2.8125, "learning_rate": 2.4389720907081447e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4040, "tokens_per_second_per_gpu": 2335.41 }, { "epoch": 0.40443379269023366, "grad_norm": 3.078125, "learning_rate": 2.438669017064769e-05, "loss": 0.5052, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4050, "tokens_per_second_per_gpu": 2467.85 }, { "epoch": 0.40543239464749353, "grad_norm": 3.46875, "learning_rate": 2.4383652116548857e-05, "loss": 0.5493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4060, "tokens_per_second_per_gpu": 2605.98 }, { "epoch": 0.40643099660475335, "grad_norm": 2.90625, "learning_rate": 2.4380606746655228e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4070, "tokens_per_second_per_gpu": 2334.34 }, { "epoch": 0.40742959856201316, "grad_norm": 3.125, "learning_rate": 2.437755406284159e-05, "loss": 0.5583, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4080, "tokens_per_second_per_gpu": 2321.28 }, { "epoch": 0.40842820051927303, "grad_norm": 3.90625, "learning_rate": 2.437449406698723e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4090, "tokens_per_second_per_gpu": 2549.02 }, { "epoch": 0.40942680247653285, "grad_norm": 4.28125, "learning_rate": 2.4371426760975934e-05, "loss": 0.5474, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4100, "tokens_per_second_per_gpu": 2536.58 }, { "epoch": 0.4104254044337927, "grad_norm": 2.90625, "learning_rate": 2.4368352146696004e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4110, "tokens_per_second_per_gpu": 2479.24 }, { "epoch": 0.41142400639105253, "grad_norm": 2.640625, "learning_rate": 2.4365270226040215e-05, "loss": 0.5162, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4120, "tokens_per_second_per_gpu": 2587.72 }, { "epoch": 0.41242260834831235, "grad_norm": 3.3125, "learning_rate": 2.4362181000905864e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4130, "tokens_per_second_per_gpu": 2513.64 }, { "epoch": 0.4134212103055722, "grad_norm": 2.546875, "learning_rate": 2.435908447319473e-05, "loss": 0.5458, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4140, "tokens_per_second_per_gpu": 2371.01 }, { "epoch": 0.41441981226283203, "grad_norm": 2.78125, "learning_rate": 2.435598064481309e-05, "loss": 0.587, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4150, "tokens_per_second_per_gpu": 2443.3 }, { "epoch": 0.41541841422009185, "grad_norm": 2.609375, "learning_rate": 2.4352869517671728e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4160, "tokens_per_second_per_gpu": 2405.91 }, { "epoch": 0.4164170161773517, "grad_norm": 3.390625, "learning_rate": 2.43497510936859e-05, "loss": 0.5347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4170, "tokens_per_second_per_gpu": 2445.15 }, { "epoch": 0.41741561813461153, "grad_norm": 3.296875, "learning_rate": 2.4346625374775372e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4180, "tokens_per_second_per_gpu": 2506.63 }, { "epoch": 0.4184142200918714, "grad_norm": 2.671875, "learning_rate": 2.4343492362864395e-05, "loss": 0.4689, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4190, "tokens_per_second_per_gpu": 2541.72 }, { "epoch": 0.4194128220491312, "grad_norm": 3.765625, "learning_rate": 2.43403520598817e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4200, "tokens_per_second_per_gpu": 2395.07 }, { "epoch": 0.42041142400639103, "grad_norm": 2.875, "learning_rate": 2.433720446776052e-05, "loss": 0.5039, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4210, "tokens_per_second_per_gpu": 2406.24 }, { "epoch": 0.4214100259636509, "grad_norm": 2.734375, "learning_rate": 2.4334049588438578e-05, "loss": 0.5541, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4220, "tokens_per_second_per_gpu": 2260.91 }, { "epoch": 0.4224086279209107, "grad_norm": 4.0, "learning_rate": 2.433088742385806e-05, "loss": 0.5895, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4230, "tokens_per_second_per_gpu": 2364.37 }, { "epoch": 0.4234072298781706, "grad_norm": 3.171875, "learning_rate": 2.432771797596567e-05, "loss": 0.4817, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4240, "tokens_per_second_per_gpu": 2583.05 }, { "epoch": 0.4244058318354304, "grad_norm": 3.21875, "learning_rate": 2.432454124671257e-05, "loss": 0.4883, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4250, "tokens_per_second_per_gpu": 2388.78 }, { "epoch": 0.4254044337926902, "grad_norm": 2.890625, "learning_rate": 2.4321357238054408e-05, "loss": 0.5259, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4260, "tokens_per_second_per_gpu": 2441.14 }, { "epoch": 0.4264030357499501, "grad_norm": 3.328125, "learning_rate": 2.4318165951951325e-05, "loss": 0.5294, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4270, "tokens_per_second_per_gpu": 2360.56 }, { "epoch": 0.4274016377072099, "grad_norm": 2.75, "learning_rate": 2.431496739036793e-05, "loss": 0.5458, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4280, "tokens_per_second_per_gpu": 2460.33 }, { "epoch": 0.4284002396644697, "grad_norm": 2.6875, "learning_rate": 2.4311761555273326e-05, "loss": 0.5095, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4290, "tokens_per_second_per_gpu": 2250.36 }, { "epoch": 0.4293988416217296, "grad_norm": 3.328125, "learning_rate": 2.4308548448641078e-05, "loss": 0.5322, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4300, "tokens_per_second_per_gpu": 2285.98 }, { "epoch": 0.4303974435789894, "grad_norm": 2.71875, "learning_rate": 2.430532807244923e-05, "loss": 0.551, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4310, "tokens_per_second_per_gpu": 2347.4 }, { "epoch": 0.4313960455362493, "grad_norm": 2.421875, "learning_rate": 2.4302100428680314e-05, "loss": 0.4959, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4320, "tokens_per_second_per_gpu": 2469.96 }, { "epoch": 0.4323946474935091, "grad_norm": 3.046875, "learning_rate": 2.4298865519321317e-05, "loss": 0.5482, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4330, "tokens_per_second_per_gpu": 2174.4 }, { "epoch": 0.4333932494507689, "grad_norm": 3.625, "learning_rate": 2.4295623346363713e-05, "loss": 0.5648, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4340, "tokens_per_second_per_gpu": 2500.3 }, { "epoch": 0.4343918514080288, "grad_norm": 3.734375, "learning_rate": 2.4292373911803447e-05, "loss": 0.5281, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4350, "tokens_per_second_per_gpu": 2461.61 }, { "epoch": 0.4353904533652886, "grad_norm": 3.21875, "learning_rate": 2.4289117217640922e-05, "loss": 0.4731, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4360, "tokens_per_second_per_gpu": 2301.74 }, { "epoch": 0.43638905532254846, "grad_norm": 3.15625, "learning_rate": 2.428585326588103e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4370, "tokens_per_second_per_gpu": 2434.79 }, { "epoch": 0.4373876572798083, "grad_norm": 3.484375, "learning_rate": 2.4282582058533108e-05, "loss": 0.5505, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4380, "tokens_per_second_per_gpu": 2549.33 }, { "epoch": 0.4383862592370681, "grad_norm": 2.921875, "learning_rate": 2.4279303597610982e-05, "loss": 0.5635, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4390, "tokens_per_second_per_gpu": 2540.72 }, { "epoch": 0.43938486119432796, "grad_norm": 3.359375, "learning_rate": 2.4276017885132922e-05, "loss": 0.5223, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4400, "tokens_per_second_per_gpu": 2450.28 }, { "epoch": 0.4403834631515878, "grad_norm": 3.125, "learning_rate": 2.427272492312168e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4410, "tokens_per_second_per_gpu": 2663.91 }, { "epoch": 0.4413820651088476, "grad_norm": 3.1875, "learning_rate": 2.426942471360447e-05, "loss": 0.5381, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4420, "tokens_per_second_per_gpu": 2389.82 }, { "epoch": 0.44238066706610746, "grad_norm": 3.390625, "learning_rate": 2.4266117258612946e-05, "loss": 0.4874, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4430, "tokens_per_second_per_gpu": 2590.38 }, { "epoch": 0.4433792690233673, "grad_norm": 2.453125, "learning_rate": 2.426280256018325e-05, "loss": 0.553, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4440, "tokens_per_second_per_gpu": 2351.79 }, { "epoch": 0.44437787098062714, "grad_norm": 3.0625, "learning_rate": 2.425948062035597e-05, "loss": 0.5712, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4450, "tokens_per_second_per_gpu": 2433.89 }, { "epoch": 0.44537647293788696, "grad_norm": 3.828125, "learning_rate": 2.425615144117615e-05, "loss": 0.5696, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4460, "tokens_per_second_per_gpu": 2414.88 }, { "epoch": 0.4463750748951468, "grad_norm": 2.859375, "learning_rate": 2.4252815024693294e-05, "loss": 0.5249, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4470, "tokens_per_second_per_gpu": 2376.92 }, { "epoch": 0.44737367685240664, "grad_norm": 2.703125, "learning_rate": 2.4249471372961362e-05, "loss": 0.5048, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4480, "tokens_per_second_per_gpu": 2420.72 }, { "epoch": 0.44837227880966646, "grad_norm": 3.484375, "learning_rate": 2.424612048803877e-05, "loss": 0.5668, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4490, "tokens_per_second_per_gpu": 2544.45 }, { "epoch": 0.44937088076692633, "grad_norm": 2.859375, "learning_rate": 2.424276237198838e-05, "loss": 0.5316, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4500, "tokens_per_second_per_gpu": 2420.85 }, { "epoch": 0.45036948272418614, "grad_norm": 2.34375, "learning_rate": 2.4239397026877516e-05, "loss": 0.5216, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4510, "tokens_per_second_per_gpu": 2419.56 }, { "epoch": 0.45136808468144596, "grad_norm": 2.640625, "learning_rate": 2.4236024454777938e-05, "loss": 0.4876, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4520, "tokens_per_second_per_gpu": 2444.45 }, { "epoch": 0.45236668663870583, "grad_norm": 3.140625, "learning_rate": 2.4232644657765874e-05, "loss": 0.5669, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4530, "tokens_per_second_per_gpu": 2443.96 }, { "epoch": 0.45336528859596564, "grad_norm": 2.375, "learning_rate": 2.422925763792198e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4540, "tokens_per_second_per_gpu": 2387.65 }, { "epoch": 0.45436389055322546, "grad_norm": 3.578125, "learning_rate": 2.422586339733137e-05, "loss": 0.5344, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4550, "tokens_per_second_per_gpu": 2422.13 }, { "epoch": 0.45536249251048533, "grad_norm": 3.453125, "learning_rate": 2.4222461938083606e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4560, "tokens_per_second_per_gpu": 2456.26 }, { "epoch": 0.45636109446774514, "grad_norm": 2.59375, "learning_rate": 2.4219053262272678e-05, "loss": 0.5671, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4570, "tokens_per_second_per_gpu": 2528.45 }, { "epoch": 0.457359696425005, "grad_norm": 3.6875, "learning_rate": 2.421563737199704e-05, "loss": 0.5183, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4580, "tokens_per_second_per_gpu": 2358.28 }, { "epoch": 0.45835829838226483, "grad_norm": 3.421875, "learning_rate": 2.4212214269359577e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4590, "tokens_per_second_per_gpu": 2611.44 }, { "epoch": 0.45935690033952464, "grad_norm": 3.546875, "learning_rate": 2.4208783956467606e-05, "loss": 0.5579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4600, "tokens_per_second_per_gpu": 2472.43 }, { "epoch": 0.4603555022967845, "grad_norm": 3.09375, "learning_rate": 2.4205346435432895e-05, "loss": 0.629, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4610, "tokens_per_second_per_gpu": 2377.02 }, { "epoch": 0.46135410425404433, "grad_norm": 2.890625, "learning_rate": 2.420190170837165e-05, "loss": 0.5716, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4620, "tokens_per_second_per_gpu": 2566.22 }, { "epoch": 0.4623527062113042, "grad_norm": 3.484375, "learning_rate": 2.4198449777404497e-05, "loss": 0.5158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4630, "tokens_per_second_per_gpu": 2220.8 }, { "epoch": 0.463351308168564, "grad_norm": 3.3125, "learning_rate": 2.419499064465652e-05, "loss": 0.6126, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4640, "tokens_per_second_per_gpu": 2309.71 }, { "epoch": 0.46434991012582383, "grad_norm": 3.265625, "learning_rate": 2.4191524312257215e-05, "loss": 0.5655, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4650, "tokens_per_second_per_gpu": 2538.15 }, { "epoch": 0.4653485120830837, "grad_norm": 3.90625, "learning_rate": 2.418805078234052e-05, "loss": 0.5663, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4660, "tokens_per_second_per_gpu": 2333.92 }, { "epoch": 0.4663471140403435, "grad_norm": 3.015625, "learning_rate": 2.4184570057044816e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4670, "tokens_per_second_per_gpu": 2502.95 }, { "epoch": 0.46734571599760333, "grad_norm": 4.4375, "learning_rate": 2.418108213851289e-05, "loss": 0.5627, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4680, "tokens_per_second_per_gpu": 2467.4 }, { "epoch": 0.4683443179548632, "grad_norm": 2.875, "learning_rate": 2.4177587028891973e-05, "loss": 0.5177, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4690, "tokens_per_second_per_gpu": 2407.5 }, { "epoch": 0.469342919912123, "grad_norm": 3.1875, "learning_rate": 2.4174084730333724e-05, "loss": 0.4718, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4700, "tokens_per_second_per_gpu": 2512.93 }, { "epoch": 0.4703415218693829, "grad_norm": 2.390625, "learning_rate": 2.417057524499421e-05, "loss": 0.488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4710, "tokens_per_second_per_gpu": 2444.95 }, { "epoch": 0.4713401238266427, "grad_norm": 3.640625, "learning_rate": 2.416705857503394e-05, "loss": 0.5131, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4720, "tokens_per_second_per_gpu": 2330.08 }, { "epoch": 0.4723387257839025, "grad_norm": 2.8125, "learning_rate": 2.4163534722617847e-05, "loss": 0.5353, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4730, "tokens_per_second_per_gpu": 2498.29 }, { "epoch": 0.4733373277411624, "grad_norm": 3.5625, "learning_rate": 2.416000368991527e-05, "loss": 0.5322, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4740, "tokens_per_second_per_gpu": 2445.76 }, { "epoch": 0.4743359296984222, "grad_norm": 3.4375, "learning_rate": 2.4156465479099988e-05, "loss": 0.5458, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4750, "tokens_per_second_per_gpu": 2424.4 }, { "epoch": 0.47533453165568207, "grad_norm": 2.859375, "learning_rate": 2.4152920092350178e-05, "loss": 0.5389, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4760, "tokens_per_second_per_gpu": 2373.17 }, { "epoch": 0.4763331336129419, "grad_norm": 3.171875, "learning_rate": 2.4149367531848453e-05, "loss": 0.5211, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4770, "tokens_per_second_per_gpu": 2600.46 }, { "epoch": 0.4773317355702017, "grad_norm": 2.984375, "learning_rate": 2.4145807799781832e-05, "loss": 0.5741, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4780, "tokens_per_second_per_gpu": 2526.23 }, { "epoch": 0.47833033752746157, "grad_norm": 2.421875, "learning_rate": 2.4142240898341748e-05, "loss": 0.5601, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4790, "tokens_per_second_per_gpu": 2572.8 }, { "epoch": 0.4793289394847214, "grad_norm": 3.015625, "learning_rate": 2.4138666829724056e-05, "loss": 0.5544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4800, "tokens_per_second_per_gpu": 2528.82 }, { "epoch": 0.48032754144198125, "grad_norm": 2.875, "learning_rate": 2.4135085596129016e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4810, "tokens_per_second_per_gpu": 2478.72 }, { "epoch": 0.48132614339924107, "grad_norm": 2.5, "learning_rate": 2.41314971997613e-05, "loss": 0.5668, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4820, "tokens_per_second_per_gpu": 2397.99 }, { "epoch": 0.4823247453565009, "grad_norm": 3.625, "learning_rate": 2.4127901642829996e-05, "loss": 0.5941, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4830, "tokens_per_second_per_gpu": 2474.64 }, { "epoch": 0.48332334731376075, "grad_norm": 3.859375, "learning_rate": 2.412429892754859e-05, "loss": 0.5743, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4840, "tokens_per_second_per_gpu": 2584.3 }, { "epoch": 0.48432194927102057, "grad_norm": 2.984375, "learning_rate": 2.4120689056134982e-05, "loss": 0.5397, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4850, "tokens_per_second_per_gpu": 2359.69 }, { "epoch": 0.4853205512282804, "grad_norm": 3.046875, "learning_rate": 2.4117072030811474e-05, "loss": 0.564, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4860, "tokens_per_second_per_gpu": 2459.11 }, { "epoch": 0.48631915318554025, "grad_norm": 3.140625, "learning_rate": 2.4113447853804776e-05, "loss": 0.5758, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4870, "tokens_per_second_per_gpu": 2461.79 }, { "epoch": 0.48731775514280007, "grad_norm": 2.734375, "learning_rate": 2.4109816527345994e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4880, "tokens_per_second_per_gpu": 2434.32 }, { "epoch": 0.48831635710005994, "grad_norm": 2.90625, "learning_rate": 2.4106178053670643e-05, "loss": 0.5302, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4890, "tokens_per_second_per_gpu": 2362.72 }, { "epoch": 0.48931495905731975, "grad_norm": 4.0625, "learning_rate": 2.410253243501863e-05, "loss": 0.5507, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4900, "tokens_per_second_per_gpu": 2235.29 }, { "epoch": 0.49031356101457957, "grad_norm": 3.34375, "learning_rate": 2.409887967363427e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4910, "tokens_per_second_per_gpu": 2518.31 }, { "epoch": 0.49131216297183944, "grad_norm": 2.96875, "learning_rate": 2.409521977176627e-05, "loss": 0.5617, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4920, "tokens_per_second_per_gpu": 2549.59 }, { "epoch": 0.49231076492909925, "grad_norm": 3.640625, "learning_rate": 2.4091552731667724e-05, "loss": 0.5003, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4930, "tokens_per_second_per_gpu": 2278.58 }, { "epoch": 0.4933093668863591, "grad_norm": 2.90625, "learning_rate": 2.4087878555596137e-05, "loss": 0.4769, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4940, "tokens_per_second_per_gpu": 2406.03 }, { "epoch": 0.49430796884361894, "grad_norm": 2.71875, "learning_rate": 2.4084197245813404e-05, "loss": 0.5552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4950, "tokens_per_second_per_gpu": 2631.77 }, { "epoch": 0.49530657080087875, "grad_norm": 3.34375, "learning_rate": 2.4080508804585798e-05, "loss": 0.4799, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4960, "tokens_per_second_per_gpu": 2549.99 }, { "epoch": 0.4963051727581386, "grad_norm": 4.0625, "learning_rate": 2.407681323418399e-05, "loss": 0.5167, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4970, "tokens_per_second_per_gpu": 2569.42 }, { "epoch": 0.49730377471539844, "grad_norm": 3.296875, "learning_rate": 2.4073110536883054e-05, "loss": 0.5434, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4980, "tokens_per_second_per_gpu": 2485.73 }, { "epoch": 0.49830237667265825, "grad_norm": 4.21875, "learning_rate": 2.4069400714962425e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 4990, "tokens_per_second_per_gpu": 2396.54 }, { "epoch": 0.4993009786299181, "grad_norm": 3.390625, "learning_rate": 2.4065683770705944e-05, "loss": 0.5372, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5000, "tokens_per_second_per_gpu": 2274.09 }, { "epoch": 0.5002995805871779, "grad_norm": 3.453125, "learning_rate": 2.4061959706401828e-05, "loss": 0.4883, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5010, "tokens_per_second_per_gpu": 2333.26 }, { "epoch": 0.5012981825444378, "grad_norm": 3.4375, "learning_rate": 2.4058228524342684e-05, "loss": 0.5563, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5020, "tokens_per_second_per_gpu": 2403.31 }, { "epoch": 0.5022967845016976, "grad_norm": 3.703125, "learning_rate": 2.4054490226825487e-05, "loss": 0.5188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5030, "tokens_per_second_per_gpu": 2215.03 }, { "epoch": 0.5032953864589574, "grad_norm": 3.765625, "learning_rate": 2.405074481615161e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5040, "tokens_per_second_per_gpu": 2538.45 }, { "epoch": 0.5042939884162173, "grad_norm": 3.328125, "learning_rate": 2.4046992294626797e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5050, "tokens_per_second_per_gpu": 2375.83 }, { "epoch": 0.5052925903734772, "grad_norm": 3.640625, "learning_rate": 2.4043232664561164e-05, "loss": 0.5268, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5060, "tokens_per_second_per_gpu": 2412.54 }, { "epoch": 0.5062911923307369, "grad_norm": 2.53125, "learning_rate": 2.403946592826921e-05, "loss": 0.5084, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5070, "tokens_per_second_per_gpu": 2450.83 }, { "epoch": 0.5072897942879968, "grad_norm": 3.265625, "learning_rate": 2.4035692088069813e-05, "loss": 0.5108, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5080, "tokens_per_second_per_gpu": 2333.46 }, { "epoch": 0.5082883962452567, "grad_norm": 3.453125, "learning_rate": 2.4031911146286208e-05, "loss": 0.59, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5090, "tokens_per_second_per_gpu": 2426.0 }, { "epoch": 0.5092869982025164, "grad_norm": 3.234375, "learning_rate": 2.4028123105246016e-05, "loss": 0.5588, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5100, "tokens_per_second_per_gpu": 2474.85 }, { "epoch": 0.5102856001597763, "grad_norm": 3.125, "learning_rate": 2.402432796728123e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5110, "tokens_per_second_per_gpu": 2506.2 }, { "epoch": 0.5112842021170362, "grad_norm": 2.8125, "learning_rate": 2.4020525734728206e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5120, "tokens_per_second_per_gpu": 2355.71 }, { "epoch": 0.5122828040742959, "grad_norm": 3.625, "learning_rate": 2.401671640992766e-05, "loss": 0.5221, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5130, "tokens_per_second_per_gpu": 2426.5 }, { "epoch": 0.5132814060315558, "grad_norm": 4.40625, "learning_rate": 2.401289999522469e-05, "loss": 0.5342, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5140, "tokens_per_second_per_gpu": 2431.84 }, { "epoch": 0.5142800079888157, "grad_norm": 3.515625, "learning_rate": 2.4009076492968748e-05, "loss": 0.5455, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5150, "tokens_per_second_per_gpu": 2411.93 }, { "epoch": 0.5152786099460755, "grad_norm": 2.5625, "learning_rate": 2.4005245905513655e-05, "loss": 0.5442, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5160, "tokens_per_second_per_gpu": 2429.42 }, { "epoch": 0.5162772119033353, "grad_norm": 3.640625, "learning_rate": 2.400140823521759e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5170, "tokens_per_second_per_gpu": 2337.28 }, { "epoch": 0.5172758138605952, "grad_norm": 2.78125, "learning_rate": 2.399756348444309e-05, "loss": 0.598, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5180, "tokens_per_second_per_gpu": 2434.73 }, { "epoch": 0.518274415817855, "grad_norm": 3.484375, "learning_rate": 2.399371165555706e-05, "loss": 0.5828, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5190, "tokens_per_second_per_gpu": 2276.22 }, { "epoch": 0.5192730177751148, "grad_norm": 3.015625, "learning_rate": 2.398985275093075e-05, "loss": 0.5385, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5200, "tokens_per_second_per_gpu": 2463.48 }, { "epoch": 0.5202716197323747, "grad_norm": 3.84375, "learning_rate": 2.398598677293978e-05, "loss": 0.5646, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5210, "tokens_per_second_per_gpu": 2286.72 }, { "epoch": 0.5212702216896345, "grad_norm": 2.609375, "learning_rate": 2.3982113723964113e-05, "loss": 0.5604, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5220, "tokens_per_second_per_gpu": 2479.19 }, { "epoch": 0.5222688236468943, "grad_norm": 3.6875, "learning_rate": 2.397823360638807e-05, "loss": 0.5133, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5230, "tokens_per_second_per_gpu": 2443.43 }, { "epoch": 0.5232674256041542, "grad_norm": 3.21875, "learning_rate": 2.397434642260032e-05, "loss": 0.5446, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5240, "tokens_per_second_per_gpu": 2366.96 }, { "epoch": 0.524266027561414, "grad_norm": 3.65625, "learning_rate": 2.3970452174993885e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5250, "tokens_per_second_per_gpu": 2459.65 }, { "epoch": 0.5252646295186738, "grad_norm": 3.109375, "learning_rate": 2.396655086596614e-05, "loss": 0.5185, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5260, "tokens_per_second_per_gpu": 2486.66 }, { "epoch": 0.5262632314759337, "grad_norm": 4.0, "learning_rate": 2.3962642497918802e-05, "loss": 0.7091, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5270, "tokens_per_second_per_gpu": 2393.16 }, { "epoch": 0.5272618334331935, "grad_norm": 2.015625, "learning_rate": 2.3958727073257937e-05, "loss": 0.5092, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5280, "tokens_per_second_per_gpu": 2549.0 }, { "epoch": 0.5282604353904534, "grad_norm": 3.0, "learning_rate": 2.3954804594393943e-05, "loss": 0.5459, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5290, "tokens_per_second_per_gpu": 2448.94 }, { "epoch": 0.5292590373477132, "grad_norm": 3.140625, "learning_rate": 2.3950875063741573e-05, "loss": 0.5546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5300, "tokens_per_second_per_gpu": 2529.08 }, { "epoch": 0.530257639304973, "grad_norm": 2.265625, "learning_rate": 2.3946938483719923e-05, "loss": 0.5281, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5310, "tokens_per_second_per_gpu": 2448.53 }, { "epoch": 0.5312562412622329, "grad_norm": 3.421875, "learning_rate": 2.394299485675242e-05, "loss": 0.5782, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5320, "tokens_per_second_per_gpu": 2392.17 }, { "epoch": 0.5322548432194927, "grad_norm": 4.21875, "learning_rate": 2.3939044185266837e-05, "loss": 0.5397, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5330, "tokens_per_second_per_gpu": 2232.2 }, { "epoch": 0.5332534451767525, "grad_norm": 3.09375, "learning_rate": 2.393508647169528e-05, "loss": 0.5584, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5340, "tokens_per_second_per_gpu": 2289.8 }, { "epoch": 0.5342520471340124, "grad_norm": 4.0, "learning_rate": 2.3931121718474182e-05, "loss": 0.5214, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5350, "tokens_per_second_per_gpu": 2614.37 }, { "epoch": 0.5352506490912722, "grad_norm": 3.296875, "learning_rate": 2.3927149928044328e-05, "loss": 0.5758, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5360, "tokens_per_second_per_gpu": 2450.7 }, { "epoch": 0.536249251048532, "grad_norm": 3.15625, "learning_rate": 2.392317110285082e-05, "loss": 0.5795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5370, "tokens_per_second_per_gpu": 2441.27 }, { "epoch": 0.5372478530057919, "grad_norm": 2.984375, "learning_rate": 2.3919185245343095e-05, "loss": 0.5365, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5380, "tokens_per_second_per_gpu": 2465.43 }, { "epoch": 0.5382464549630517, "grad_norm": 3.359375, "learning_rate": 2.3915192357974927e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5390, "tokens_per_second_per_gpu": 2501.65 }, { "epoch": 0.5392450569203115, "grad_norm": 4.03125, "learning_rate": 2.3911192443204407e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5400, "tokens_per_second_per_gpu": 2271.73 }, { "epoch": 0.5402436588775714, "grad_norm": 3.125, "learning_rate": 2.390718550349395e-05, "loss": 0.5488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5410, "tokens_per_second_per_gpu": 2594.14 }, { "epoch": 0.5412422608348313, "grad_norm": 4.0, "learning_rate": 2.390317154131031e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5420, "tokens_per_second_per_gpu": 2421.96 }, { "epoch": 0.542240862792091, "grad_norm": 3.171875, "learning_rate": 2.3899150559124554e-05, "loss": 0.5541, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5430, "tokens_per_second_per_gpu": 2476.47 }, { "epoch": 0.5432394647493509, "grad_norm": 3.3125, "learning_rate": 2.389512255941207e-05, "loss": 0.5335, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5440, "tokens_per_second_per_gpu": 2410.69 }, { "epoch": 0.5442380667066108, "grad_norm": 3.046875, "learning_rate": 2.3891087544652573e-05, "loss": 0.5144, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5450, "tokens_per_second_per_gpu": 2543.43 }, { "epoch": 0.5452366686638705, "grad_norm": 3.859375, "learning_rate": 2.3887045517330085e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5460, "tokens_per_second_per_gpu": 2364.21 }, { "epoch": 0.5462352706211304, "grad_norm": 3.0625, "learning_rate": 2.388299647993296e-05, "loss": 0.5521, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5470, "tokens_per_second_per_gpu": 2478.25 }, { "epoch": 0.5472338725783903, "grad_norm": 3.015625, "learning_rate": 2.387894043495386e-05, "loss": 0.5718, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5480, "tokens_per_second_per_gpu": 2283.55 }, { "epoch": 0.54823247453565, "grad_norm": 3.4375, "learning_rate": 2.3874877384889756e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5490, "tokens_per_second_per_gpu": 2275.5 }, { "epoch": 0.5492310764929099, "grad_norm": 3.640625, "learning_rate": 2.387080733224194e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5500, "tokens_per_second_per_gpu": 2620.18 }, { "epoch": 0.5502296784501698, "grad_norm": 3.359375, "learning_rate": 2.3866730279516006e-05, "loss": 0.5682, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5510, "tokens_per_second_per_gpu": 2320.04 }, { "epoch": 0.5512282804074295, "grad_norm": 2.90625, "learning_rate": 2.3862646229221867e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5520, "tokens_per_second_per_gpu": 2584.93 }, { "epoch": 0.5522268823646894, "grad_norm": 3.9375, "learning_rate": 2.385855518387374e-05, "loss": 0.5627, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5530, "tokens_per_second_per_gpu": 2356.73 }, { "epoch": 0.5532254843219493, "grad_norm": 3.25, "learning_rate": 2.3854457145990146e-05, "loss": 0.5535, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5540, "tokens_per_second_per_gpu": 2260.76 }, { "epoch": 0.5542240862792092, "grad_norm": 2.875, "learning_rate": 2.385035211809391e-05, "loss": 0.5486, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5550, "tokens_per_second_per_gpu": 2326.77 }, { "epoch": 0.5552226882364689, "grad_norm": 2.625, "learning_rate": 2.384624010271217e-05, "loss": 0.5493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5560, "tokens_per_second_per_gpu": 2385.14 }, { "epoch": 0.5562212901937288, "grad_norm": 3.984375, "learning_rate": 2.384212110237635e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5570, "tokens_per_second_per_gpu": 2473.04 }, { "epoch": 0.5572198921509887, "grad_norm": 4.28125, "learning_rate": 2.3837995119622185e-05, "loss": 0.546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5580, "tokens_per_second_per_gpu": 2410.71 }, { "epoch": 0.5582184941082484, "grad_norm": 3.484375, "learning_rate": 2.383386215698971e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5590, "tokens_per_second_per_gpu": 2304.25 }, { "epoch": 0.5592170960655083, "grad_norm": 3.578125, "learning_rate": 2.3829722217023255e-05, "loss": 0.5402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5600, "tokens_per_second_per_gpu": 2349.77 }, { "epoch": 0.5602156980227682, "grad_norm": 3.796875, "learning_rate": 2.382557530227143e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5610, "tokens_per_second_per_gpu": 2402.1 }, { "epoch": 0.5612142999800279, "grad_norm": 3.515625, "learning_rate": 2.3821421415287162e-05, "loss": 0.524, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5620, "tokens_per_second_per_gpu": 2476.49 }, { "epoch": 0.5622129019372878, "grad_norm": 2.859375, "learning_rate": 2.3817260558627658e-05, "loss": 0.5359, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5630, "tokens_per_second_per_gpu": 2464.38 }, { "epoch": 0.5632115038945477, "grad_norm": 2.90625, "learning_rate": 2.381309273485442e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5640, "tokens_per_second_per_gpu": 2383.39 }, { "epoch": 0.5642101058518074, "grad_norm": 2.78125, "learning_rate": 2.380891794653323e-05, "loss": 0.5132, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5650, "tokens_per_second_per_gpu": 2358.67 }, { "epoch": 0.5652087078090673, "grad_norm": 3.34375, "learning_rate": 2.3804736196234177e-05, "loss": 0.5258, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5660, "tokens_per_second_per_gpu": 2580.38 }, { "epoch": 0.5662073097663272, "grad_norm": 3.0, "learning_rate": 2.3800547486531614e-05, "loss": 0.5578, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5670, "tokens_per_second_per_gpu": 2421.23 }, { "epoch": 0.567205911723587, "grad_norm": 3.296875, "learning_rate": 2.3796351820004188e-05, "loss": 0.5441, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5680, "tokens_per_second_per_gpu": 2427.33 }, { "epoch": 0.5682045136808468, "grad_norm": 3.59375, "learning_rate": 2.379214919923483e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5690, "tokens_per_second_per_gpu": 2168.69 }, { "epoch": 0.5692031156381067, "grad_norm": 2.90625, "learning_rate": 2.378793962681075e-05, "loss": 0.5647, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5700, "tokens_per_second_per_gpu": 2451.56 }, { "epoch": 0.5702017175953665, "grad_norm": 3.40625, "learning_rate": 2.378372310532344e-05, "loss": 0.5448, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5710, "tokens_per_second_per_gpu": 2396.81 }, { "epoch": 0.5712003195526263, "grad_norm": 3.8125, "learning_rate": 2.377949963736867e-05, "loss": 0.5739, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5720, "tokens_per_second_per_gpu": 2300.7 }, { "epoch": 0.5721989215098862, "grad_norm": 3.421875, "learning_rate": 2.3775269225546477e-05, "loss": 0.5776, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5730, "tokens_per_second_per_gpu": 2476.96 }, { "epoch": 0.573197523467146, "grad_norm": 2.828125, "learning_rate": 2.3771031872461186e-05, "loss": 0.5555, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5740, "tokens_per_second_per_gpu": 2466.16 }, { "epoch": 0.5741961254244058, "grad_norm": 2.859375, "learning_rate": 2.376678758072139e-05, "loss": 0.551, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5750, "tokens_per_second_per_gpu": 2492.07 }, { "epoch": 0.5751947273816657, "grad_norm": 3.15625, "learning_rate": 2.376253635293995e-05, "loss": 0.5041, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5760, "tokens_per_second_per_gpu": 2523.54 }, { "epoch": 0.5761933293389255, "grad_norm": 3.328125, "learning_rate": 2.3758278191734004e-05, "loss": 0.5486, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5770, "tokens_per_second_per_gpu": 2361.79 }, { "epoch": 0.5771919312961853, "grad_norm": 2.84375, "learning_rate": 2.3754013099724952e-05, "loss": 0.5194, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5780, "tokens_per_second_per_gpu": 2388.88 }, { "epoch": 0.5781905332534452, "grad_norm": 2.53125, "learning_rate": 2.3749741079538463e-05, "loss": 0.5631, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5790, "tokens_per_second_per_gpu": 2678.98 }, { "epoch": 0.579189135210705, "grad_norm": 2.6875, "learning_rate": 2.3745462133804468e-05, "loss": 0.5004, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5800, "tokens_per_second_per_gpu": 2481.67 }, { "epoch": 0.5801877371679649, "grad_norm": 2.53125, "learning_rate": 2.374117626515717e-05, "loss": 0.4748, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5810, "tokens_per_second_per_gpu": 2458.91 }, { "epoch": 0.5811863391252247, "grad_norm": 3.40625, "learning_rate": 2.3736883476235025e-05, "loss": 0.4887, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5820, "tokens_per_second_per_gpu": 2547.53 }, { "epoch": 0.5821849410824845, "grad_norm": 2.875, "learning_rate": 2.3732583769680754e-05, "loss": 0.5811, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5830, "tokens_per_second_per_gpu": 2409.01 }, { "epoch": 0.5831835430397444, "grad_norm": 4.0625, "learning_rate": 2.3728277148141338e-05, "loss": 0.5326, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5840, "tokens_per_second_per_gpu": 2469.02 }, { "epoch": 0.5841821449970042, "grad_norm": 3.53125, "learning_rate": 2.372396361426801e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5850, "tokens_per_second_per_gpu": 2370.35 }, { "epoch": 0.585180746954264, "grad_norm": 3.296875, "learning_rate": 2.371964317071626e-05, "loss": 0.5683, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5860, "tokens_per_second_per_gpu": 2452.34 }, { "epoch": 0.5861793489115239, "grad_norm": 2.75, "learning_rate": 2.3715315820145835e-05, "loss": 0.503, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5870, "tokens_per_second_per_gpu": 2460.97 }, { "epoch": 0.5871779508687837, "grad_norm": 3.03125, "learning_rate": 2.3710981565220727e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5880, "tokens_per_second_per_gpu": 2396.15 }, { "epoch": 0.5881765528260435, "grad_norm": 2.390625, "learning_rate": 2.370664040860919e-05, "loss": 0.5046, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5890, "tokens_per_second_per_gpu": 2427.32 }, { "epoch": 0.5891751547833034, "grad_norm": 3.796875, "learning_rate": 2.370229235298371e-05, "loss": 0.5362, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5900, "tokens_per_second_per_gpu": 2388.97 }, { "epoch": 0.5901737567405632, "grad_norm": 4.0, "learning_rate": 2.369793740102104e-05, "loss": 0.5232, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5910, "tokens_per_second_per_gpu": 2517.35 }, { "epoch": 0.591172358697823, "grad_norm": 2.84375, "learning_rate": 2.3693575555402164e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5920, "tokens_per_second_per_gpu": 2574.84 }, { "epoch": 0.5921709606550829, "grad_norm": 3.4375, "learning_rate": 2.3689206818812317e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5930, "tokens_per_second_per_gpu": 2438.65 }, { "epoch": 0.5931695626123428, "grad_norm": 3.421875, "learning_rate": 2.3684831193940965e-05, "loss": 0.5366, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5940, "tokens_per_second_per_gpu": 2452.1 }, { "epoch": 0.5941681645696025, "grad_norm": 2.875, "learning_rate": 2.3680448683481835e-05, "loss": 0.5303, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5950, "tokens_per_second_per_gpu": 2484.88 }, { "epoch": 0.5951667665268624, "grad_norm": 2.75, "learning_rate": 2.3676059290132874e-05, "loss": 0.5421, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5960, "tokens_per_second_per_gpu": 2506.0 }, { "epoch": 0.5961653684841223, "grad_norm": 3.046875, "learning_rate": 2.3671663016596273e-05, "loss": 0.5615, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5970, "tokens_per_second_per_gpu": 2361.73 }, { "epoch": 0.597163970441382, "grad_norm": 3.0625, "learning_rate": 2.3667259865578463e-05, "loss": 0.5055, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5980, "tokens_per_second_per_gpu": 2644.14 }, { "epoch": 0.5981625723986419, "grad_norm": 3.875, "learning_rate": 2.36628498397901e-05, "loss": 0.5696, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 5990, "tokens_per_second_per_gpu": 2531.78 }, { "epoch": 0.5991611743559018, "grad_norm": 3.03125, "learning_rate": 2.3658432941946084e-05, "loss": 0.5633, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6000, "tokens_per_second_per_gpu": 2540.37 }, { "epoch": 0.6001597763131615, "grad_norm": 2.171875, "learning_rate": 2.3654009174765532e-05, "loss": 0.4821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6010, "tokens_per_second_per_gpu": 2310.62 }, { "epoch": 0.6011583782704214, "grad_norm": 3.28125, "learning_rate": 2.36495785409718e-05, "loss": 0.5541, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6020, "tokens_per_second_per_gpu": 2425.51 }, { "epoch": 0.6021569802276813, "grad_norm": 2.875, "learning_rate": 2.364514104329246e-05, "loss": 0.4777, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6030, "tokens_per_second_per_gpu": 2557.81 }, { "epoch": 0.603155582184941, "grad_norm": 2.703125, "learning_rate": 2.364069668445933e-05, "loss": 0.5474, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6040, "tokens_per_second_per_gpu": 2283.58 }, { "epoch": 0.6041541841422009, "grad_norm": 2.453125, "learning_rate": 2.3636245467208428e-05, "loss": 0.5488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6050, "tokens_per_second_per_gpu": 2460.04 }, { "epoch": 0.6051527860994608, "grad_norm": 3.5, "learning_rate": 2.3631787394280007e-05, "loss": 0.5247, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6060, "tokens_per_second_per_gpu": 2518.49 }, { "epoch": 0.6061513880567206, "grad_norm": 3.5625, "learning_rate": 2.3627322468418547e-05, "loss": 0.52, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6070, "tokens_per_second_per_gpu": 2503.82 }, { "epoch": 0.6071499900139804, "grad_norm": 3.234375, "learning_rate": 2.362285069237273e-05, "loss": 0.588, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6080, "tokens_per_second_per_gpu": 2571.65 }, { "epoch": 0.6081485919712403, "grad_norm": 3.0625, "learning_rate": 2.3618372068895465e-05, "loss": 0.5022, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6090, "tokens_per_second_per_gpu": 2359.77 }, { "epoch": 0.6091471939285001, "grad_norm": 2.640625, "learning_rate": 2.361388660074388e-05, "loss": 0.5529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6100, "tokens_per_second_per_gpu": 2441.49 }, { "epoch": 0.6101457958857599, "grad_norm": 3.25, "learning_rate": 2.3609394290679303e-05, "loss": 0.5274, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6110, "tokens_per_second_per_gpu": 2389.13 }, { "epoch": 0.6111443978430198, "grad_norm": 2.640625, "learning_rate": 2.3604895141467294e-05, "loss": 0.5642, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6120, "tokens_per_second_per_gpu": 2468.79 }, { "epoch": 0.6121429998002796, "grad_norm": 2.453125, "learning_rate": 2.36003891558776e-05, "loss": 0.5241, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6130, "tokens_per_second_per_gpu": 2412.56 }, { "epoch": 0.6131416017575394, "grad_norm": 2.671875, "learning_rate": 2.35958763366842e-05, "loss": 0.539, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6140, "tokens_per_second_per_gpu": 2438.1 }, { "epoch": 0.6141402037147993, "grad_norm": 3.59375, "learning_rate": 2.3591356686665255e-05, "loss": 0.5303, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6150, "tokens_per_second_per_gpu": 2391.96 }, { "epoch": 0.6151388056720591, "grad_norm": 3.015625, "learning_rate": 2.3586830208603156e-05, "loss": 0.5435, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6160, "tokens_per_second_per_gpu": 2486.14 }, { "epoch": 0.6161374076293189, "grad_norm": 3.84375, "learning_rate": 2.358229690528448e-05, "loss": 0.5236, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6170, "tokens_per_second_per_gpu": 2406.55 }, { "epoch": 0.6171360095865788, "grad_norm": 3.078125, "learning_rate": 2.357775677950002e-05, "loss": 0.5432, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6180, "tokens_per_second_per_gpu": 2499.59 }, { "epoch": 0.6181346115438386, "grad_norm": 3.25, "learning_rate": 2.357320983404475e-05, "loss": 0.535, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6190, "tokens_per_second_per_gpu": 2267.46 }, { "epoch": 0.6191332135010985, "grad_norm": 2.078125, "learning_rate": 2.356865607171786e-05, "loss": 0.4825, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6200, "tokens_per_second_per_gpu": 2329.38 }, { "epoch": 0.6201318154583583, "grad_norm": 4.03125, "learning_rate": 2.3564095495322726e-05, "loss": 0.5419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6210, "tokens_per_second_per_gpu": 2427.66 }, { "epoch": 0.6211304174156181, "grad_norm": 3.546875, "learning_rate": 2.3559528107666927e-05, "loss": 0.5854, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6220, "tokens_per_second_per_gpu": 2739.51 }, { "epoch": 0.622129019372878, "grad_norm": 3.25, "learning_rate": 2.3554953911562233e-05, "loss": 0.5563, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6230, "tokens_per_second_per_gpu": 2437.26 }, { "epoch": 0.6231276213301378, "grad_norm": 3.0, "learning_rate": 2.3550372909824595e-05, "loss": 0.5111, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6240, "tokens_per_second_per_gpu": 2474.2 }, { "epoch": 0.6241262232873976, "grad_norm": 2.65625, "learning_rate": 2.3545785105274164e-05, "loss": 0.5278, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6250, "tokens_per_second_per_gpu": 2473.32 }, { "epoch": 0.6251248252446575, "grad_norm": 2.796875, "learning_rate": 2.3541190500735284e-05, "loss": 0.5701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6260, "tokens_per_second_per_gpu": 2145.78 }, { "epoch": 0.6261234272019173, "grad_norm": 2.65625, "learning_rate": 2.3536589099036472e-05, "loss": 0.5337, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6270, "tokens_per_second_per_gpu": 2503.2 }, { "epoch": 0.6271220291591771, "grad_norm": 3.0625, "learning_rate": 2.3531980903010434e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6280, "tokens_per_second_per_gpu": 2354.88 }, { "epoch": 0.628120631116437, "grad_norm": 3.421875, "learning_rate": 2.3527365915494065e-05, "loss": 0.5665, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6290, "tokens_per_second_per_gpu": 2313.86 }, { "epoch": 0.6291192330736968, "grad_norm": 2.4375, "learning_rate": 2.352274413932843e-05, "loss": 0.4569, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6300, "tokens_per_second_per_gpu": 2415.36 }, { "epoch": 0.6301178350309566, "grad_norm": 3.21875, "learning_rate": 2.3518115577358787e-05, "loss": 0.5335, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6310, "tokens_per_second_per_gpu": 2356.65 }, { "epoch": 0.6311164369882165, "grad_norm": 3.5, "learning_rate": 2.3513480232434558e-05, "loss": 0.5182, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6320, "tokens_per_second_per_gpu": 2441.75 }, { "epoch": 0.6321150389454764, "grad_norm": 3.390625, "learning_rate": 2.350883810740935e-05, "loss": 0.5574, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6330, "tokens_per_second_per_gpu": 2175.82 }, { "epoch": 0.6331136409027361, "grad_norm": 3.078125, "learning_rate": 2.3504189205140942e-05, "loss": 0.5581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6340, "tokens_per_second_per_gpu": 2490.39 }, { "epoch": 0.634112242859996, "grad_norm": 3.453125, "learning_rate": 2.3499533528491274e-05, "loss": 0.5094, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6350, "tokens_per_second_per_gpu": 2165.91 }, { "epoch": 0.6351108448172559, "grad_norm": 3.0625, "learning_rate": 2.349487108032648e-05, "loss": 0.5649, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6360, "tokens_per_second_per_gpu": 2519.19 }, { "epoch": 0.6361094467745156, "grad_norm": 2.890625, "learning_rate": 2.349020186351684e-05, "loss": 0.5346, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6370, "tokens_per_second_per_gpu": 2634.85 }, { "epoch": 0.6371080487317755, "grad_norm": 3.578125, "learning_rate": 2.348552588093681e-05, "loss": 0.522, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6380, "tokens_per_second_per_gpu": 2210.78 }, { "epoch": 0.6381066506890354, "grad_norm": 3.078125, "learning_rate": 2.3480843135465015e-05, "loss": 0.5757, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6390, "tokens_per_second_per_gpu": 2360.44 }, { "epoch": 0.6391052526462951, "grad_norm": 2.96875, "learning_rate": 2.3476153629984236e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6400, "tokens_per_second_per_gpu": 2294.92 }, { "epoch": 0.640103854603555, "grad_norm": 2.90625, "learning_rate": 2.3471457367381418e-05, "loss": 0.564, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6410, "tokens_per_second_per_gpu": 2408.76 }, { "epoch": 0.6411024565608149, "grad_norm": 3.328125, "learning_rate": 2.346675435054767e-05, "loss": 0.5139, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6420, "tokens_per_second_per_gpu": 2312.73 }, { "epoch": 0.6421010585180746, "grad_norm": 3.921875, "learning_rate": 2.3462044582378252e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6430, "tokens_per_second_per_gpu": 2381.74 }, { "epoch": 0.6430996604753345, "grad_norm": 3.203125, "learning_rate": 2.345732806577259e-05, "loss": 0.5052, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6440, "tokens_per_second_per_gpu": 1579.19 }, { "epoch": 0.6440982624325944, "grad_norm": 2.53125, "learning_rate": 2.3452604803634253e-05, "loss": 0.4741, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6450, "tokens_per_second_per_gpu": 2416.04 }, { "epoch": 0.6450968643898543, "grad_norm": 3.28125, "learning_rate": 2.3447874798870967e-05, "loss": 0.4842, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6460, "tokens_per_second_per_gpu": 2497.58 }, { "epoch": 0.646095466347114, "grad_norm": 3.453125, "learning_rate": 2.3443138054394616e-05, "loss": 0.573, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6470, "tokens_per_second_per_gpu": 2470.82 }, { "epoch": 0.6470940683043739, "grad_norm": 3.859375, "learning_rate": 2.343839457312122e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6480, "tokens_per_second_per_gpu": 2508.61 }, { "epoch": 0.6480926702616338, "grad_norm": 2.609375, "learning_rate": 2.3433644357970956e-05, "loss": 0.4742, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6490, "tokens_per_second_per_gpu": 2361.45 }, { "epoch": 0.6490912722188935, "grad_norm": 3.234375, "learning_rate": 2.3428887411868144e-05, "loss": 0.5069, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6500, "tokens_per_second_per_gpu": 2573.21 }, { "epoch": 0.6500898741761534, "grad_norm": 2.375, "learning_rate": 2.342412373774125e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6510, "tokens_per_second_per_gpu": 2390.31 }, { "epoch": 0.6510884761334133, "grad_norm": 3.34375, "learning_rate": 2.3419353338522877e-05, "loss": 0.5201, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6520, "tokens_per_second_per_gpu": 2412.2 }, { "epoch": 0.652087078090673, "grad_norm": 2.734375, "learning_rate": 2.341457621714977e-05, "loss": 0.477, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6530, "tokens_per_second_per_gpu": 2419.49 }, { "epoch": 0.6530856800479329, "grad_norm": 2.71875, "learning_rate": 2.340979237656281e-05, "loss": 0.5396, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6540, "tokens_per_second_per_gpu": 2518.31 }, { "epoch": 0.6540842820051928, "grad_norm": 2.578125, "learning_rate": 2.3405001819707027e-05, "loss": 0.5651, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6550, "tokens_per_second_per_gpu": 2675.52 }, { "epoch": 0.6550828839624525, "grad_norm": 3.171875, "learning_rate": 2.340020454953157e-05, "loss": 0.5521, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6560, "tokens_per_second_per_gpu": 2316.57 }, { "epoch": 0.6560814859197124, "grad_norm": 3.203125, "learning_rate": 2.3395400568989723e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6570, "tokens_per_second_per_gpu": 2570.83 }, { "epoch": 0.6570800878769723, "grad_norm": 2.78125, "learning_rate": 2.3390589881038904e-05, "loss": 0.5669, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6580, "tokens_per_second_per_gpu": 2453.61 }, { "epoch": 0.6580786898342321, "grad_norm": 2.625, "learning_rate": 2.3385772488640672e-05, "loss": 0.509, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6590, "tokens_per_second_per_gpu": 2478.86 }, { "epoch": 0.6590772917914919, "grad_norm": 4.46875, "learning_rate": 2.3380948394760694e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6600, "tokens_per_second_per_gpu": 2574.84 }, { "epoch": 0.6600758937487518, "grad_norm": 3.9375, "learning_rate": 2.3376117602368776e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6610, "tokens_per_second_per_gpu": 2408.42 }, { "epoch": 0.6610744957060116, "grad_norm": 4.09375, "learning_rate": 2.3371280114438833e-05, "loss": 0.4688, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6620, "tokens_per_second_per_gpu": 2354.74 }, { "epoch": 0.6620730976632714, "grad_norm": 3.25, "learning_rate": 2.3366435933948915e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6630, "tokens_per_second_per_gpu": 2481.68 }, { "epoch": 0.6630716996205313, "grad_norm": 2.875, "learning_rate": 2.3361585063881198e-05, "loss": 0.5635, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6640, "tokens_per_second_per_gpu": 2369.43 }, { "epoch": 0.6640703015777911, "grad_norm": 2.6875, "learning_rate": 2.335672750722196e-05, "loss": 0.4554, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6650, "tokens_per_second_per_gpu": 2360.28 }, { "epoch": 0.6650689035350509, "grad_norm": 2.6875, "learning_rate": 2.3351863266961598e-05, "loss": 0.525, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6660, "tokens_per_second_per_gpu": 2265.32 }, { "epoch": 0.6660675054923108, "grad_norm": 2.5625, "learning_rate": 2.3346992346094633e-05, "loss": 0.5632, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6670, "tokens_per_second_per_gpu": 2498.8 }, { "epoch": 0.6670661074495706, "grad_norm": 4.125, "learning_rate": 2.3342114747619692e-05, "loss": 0.5214, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6680, "tokens_per_second_per_gpu": 2449.85 }, { "epoch": 0.6680647094068305, "grad_norm": 3.1875, "learning_rate": 2.333723047453952e-05, "loss": 0.5811, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6690, "tokens_per_second_per_gpu": 2385.67 }, { "epoch": 0.6690633113640903, "grad_norm": 3.46875, "learning_rate": 2.3332339529860956e-05, "loss": 0.5717, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6700, "tokens_per_second_per_gpu": 2267.59 }, { "epoch": 0.6700619133213501, "grad_norm": 2.71875, "learning_rate": 2.3327441916594957e-05, "loss": 0.554, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6710, "tokens_per_second_per_gpu": 2692.34 }, { "epoch": 0.67106051527861, "grad_norm": 2.46875, "learning_rate": 2.3322537637756592e-05, "loss": 0.5155, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6720, "tokens_per_second_per_gpu": 2299.34 }, { "epoch": 0.6720591172358698, "grad_norm": 2.765625, "learning_rate": 2.3317626696365013e-05, "loss": 0.4726, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6730, "tokens_per_second_per_gpu": 2434.43 }, { "epoch": 0.6730577191931296, "grad_norm": 3.125, "learning_rate": 2.33127090954435e-05, "loss": 0.5377, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6740, "tokens_per_second_per_gpu": 2353.23 }, { "epoch": 0.6740563211503895, "grad_norm": 3.125, "learning_rate": 2.330778483801941e-05, "loss": 0.5512, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6750, "tokens_per_second_per_gpu": 2455.87 }, { "epoch": 0.6750549231076493, "grad_norm": 3.296875, "learning_rate": 2.3302853927124203e-05, "loss": 0.5802, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6760, "tokens_per_second_per_gpu": 2403.47 }, { "epoch": 0.6760535250649091, "grad_norm": 3.0625, "learning_rate": 2.329791636579344e-05, "loss": 0.4898, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6770, "tokens_per_second_per_gpu": 2439.32 }, { "epoch": 0.677052127022169, "grad_norm": 3.375, "learning_rate": 2.3292972157066785e-05, "loss": 0.5873, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6780, "tokens_per_second_per_gpu": 2517.71 }, { "epoch": 0.6780507289794288, "grad_norm": 3.328125, "learning_rate": 2.3288021303987974e-05, "loss": 0.5384, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6790, "tokens_per_second_per_gpu": 2337.9 }, { "epoch": 0.6790493309366886, "grad_norm": 3.0625, "learning_rate": 2.3283063809604848e-05, "loss": 0.557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6800, "tokens_per_second_per_gpu": 2460.23 }, { "epoch": 0.6800479328939485, "grad_norm": 3.046875, "learning_rate": 2.3278099676969325e-05, "loss": 0.5572, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6810, "tokens_per_second_per_gpu": 2422.17 }, { "epoch": 0.6810465348512084, "grad_norm": 3.140625, "learning_rate": 2.327312890913742e-05, "loss": 0.5814, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6820, "tokens_per_second_per_gpu": 2366.69 }, { "epoch": 0.6820451368084681, "grad_norm": 3.734375, "learning_rate": 2.3268151509169233e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6830, "tokens_per_second_per_gpu": 2381.57 }, { "epoch": 0.683043738765728, "grad_norm": 2.546875, "learning_rate": 2.3263167480128938e-05, "loss": 0.4473, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6840, "tokens_per_second_per_gpu": 2333.71 }, { "epoch": 0.6840423407229879, "grad_norm": 3.25, "learning_rate": 2.32581768250848e-05, "loss": 0.5636, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6850, "tokens_per_second_per_gpu": 2466.48 }, { "epoch": 0.6850409426802476, "grad_norm": 4.40625, "learning_rate": 2.325317954710915e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6860, "tokens_per_second_per_gpu": 2504.67 }, { "epoch": 0.6860395446375075, "grad_norm": 3.890625, "learning_rate": 2.3248175649278412e-05, "loss": 0.5075, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6870, "tokens_per_second_per_gpu": 2422.32 }, { "epoch": 0.6870381465947674, "grad_norm": 3.015625, "learning_rate": 2.3243165134673074e-05, "loss": 0.5325, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6880, "tokens_per_second_per_gpu": 2311.97 }, { "epoch": 0.6880367485520271, "grad_norm": 2.84375, "learning_rate": 2.3238148006377696e-05, "loss": 0.5173, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6890, "tokens_per_second_per_gpu": 2550.78 }, { "epoch": 0.689035350509287, "grad_norm": 3.09375, "learning_rate": 2.3233124267480928e-05, "loss": 0.5285, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6900, "tokens_per_second_per_gpu": 2353.74 }, { "epoch": 0.6900339524665469, "grad_norm": 3.546875, "learning_rate": 2.322809392107546e-05, "loss": 0.559, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6910, "tokens_per_second_per_gpu": 2332.07 }, { "epoch": 0.6910325544238066, "grad_norm": 2.953125, "learning_rate": 2.3223056970258078e-05, "loss": 0.5789, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6920, "tokens_per_second_per_gpu": 2275.78 }, { "epoch": 0.6920311563810665, "grad_norm": 3.578125, "learning_rate": 2.3218013418129618e-05, "loss": 0.5572, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6930, "tokens_per_second_per_gpu": 2492.71 }, { "epoch": 0.6930297583383264, "grad_norm": 2.953125, "learning_rate": 2.321296326779498e-05, "loss": 0.5665, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6940, "tokens_per_second_per_gpu": 2536.83 }, { "epoch": 0.6940283602955862, "grad_norm": 2.71875, "learning_rate": 2.3207906522363133e-05, "loss": 0.5734, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6950, "tokens_per_second_per_gpu": 2396.56 }, { "epoch": 0.695026962252846, "grad_norm": 3.25, "learning_rate": 2.3202843184947105e-05, "loss": 0.457, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6960, "tokens_per_second_per_gpu": 2154.0 }, { "epoch": 0.6960255642101059, "grad_norm": 3.0, "learning_rate": 2.319777325866397e-05, "loss": 0.5339, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6970, "tokens_per_second_per_gpu": 2304.08 }, { "epoch": 0.6970241661673657, "grad_norm": 3.515625, "learning_rate": 2.3192696746634875e-05, "loss": 0.5745, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6980, "tokens_per_second_per_gpu": 2525.33 }, { "epoch": 0.6980227681246255, "grad_norm": 3.125, "learning_rate": 2.3187613651985017e-05, "loss": 0.5382, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 6990, "tokens_per_second_per_gpu": 2364.81 }, { "epoch": 0.6990213700818854, "grad_norm": 2.890625, "learning_rate": 2.3182523977843637e-05, "loss": 0.4892, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7000, "tokens_per_second_per_gpu": 2545.07 }, { "epoch": 0.7000199720391452, "grad_norm": 3.625, "learning_rate": 2.317742772734403e-05, "loss": 0.5606, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7010, "tokens_per_second_per_gpu": 2281.94 }, { "epoch": 0.701018573996405, "grad_norm": 3.484375, "learning_rate": 2.317232490362355e-05, "loss": 0.5464, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7020, "tokens_per_second_per_gpu": 2285.52 }, { "epoch": 0.7020171759536649, "grad_norm": 3.71875, "learning_rate": 2.3167215509823583e-05, "loss": 0.5433, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7030, "tokens_per_second_per_gpu": 2402.18 }, { "epoch": 0.7030157779109247, "grad_norm": 3.140625, "learning_rate": 2.3162099549089566e-05, "loss": 0.5261, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7040, "tokens_per_second_per_gpu": 2287.22 }, { "epoch": 0.7040143798681845, "grad_norm": 2.9375, "learning_rate": 2.315697702457098e-05, "loss": 0.5437, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7050, "tokens_per_second_per_gpu": 2387.8 }, { "epoch": 0.7050129818254444, "grad_norm": 3.03125, "learning_rate": 2.3151847939421348e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7060, "tokens_per_second_per_gpu": 2282.69 }, { "epoch": 0.7060115837827042, "grad_norm": 3.328125, "learning_rate": 2.3146712296798223e-05, "loss": 0.5603, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7070, "tokens_per_second_per_gpu": 2206.5 }, { "epoch": 0.7070101857399641, "grad_norm": 2.78125, "learning_rate": 2.3141570099863206e-05, "loss": 0.5188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7080, "tokens_per_second_per_gpu": 2288.83 }, { "epoch": 0.7080087876972239, "grad_norm": 2.921875, "learning_rate": 2.3136421351781923e-05, "loss": 0.5354, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7090, "tokens_per_second_per_gpu": 2443.25 }, { "epoch": 0.7090073896544837, "grad_norm": 2.640625, "learning_rate": 2.3131266055724045e-05, "loss": 0.5757, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7100, "tokens_per_second_per_gpu": 2288.69 }, { "epoch": 0.7100059916117436, "grad_norm": 2.484375, "learning_rate": 2.312610421486326e-05, "loss": 0.5682, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7110, "tokens_per_second_per_gpu": 2410.45 }, { "epoch": 0.7110045935690034, "grad_norm": 3.84375, "learning_rate": 2.31209358323773e-05, "loss": 0.5417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7120, "tokens_per_second_per_gpu": 2399.9 }, { "epoch": 0.7120031955262632, "grad_norm": 3.15625, "learning_rate": 2.3115760911447906e-05, "loss": 0.4968, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7130, "tokens_per_second_per_gpu": 2361.98 }, { "epoch": 0.7130017974835231, "grad_norm": 2.765625, "learning_rate": 2.311057945526086e-05, "loss": 0.4429, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7140, "tokens_per_second_per_gpu": 2553.69 }, { "epoch": 0.7140003994407829, "grad_norm": 3.78125, "learning_rate": 2.3105391467005967e-05, "loss": 0.536, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7150, "tokens_per_second_per_gpu": 2363.76 }, { "epoch": 0.7149990013980427, "grad_norm": 2.265625, "learning_rate": 2.3100196949877044e-05, "loss": 0.4759, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7160, "tokens_per_second_per_gpu": 2283.97 }, { "epoch": 0.7159976033553026, "grad_norm": 3.21875, "learning_rate": 2.3094995907071925e-05, "loss": 0.5372, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7170, "tokens_per_second_per_gpu": 2337.5 }, { "epoch": 0.7169962053125624, "grad_norm": 3.953125, "learning_rate": 2.3089788341792473e-05, "loss": 0.5237, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7180, "tokens_per_second_per_gpu": 2475.47 }, { "epoch": 0.7179948072698222, "grad_norm": 3.328125, "learning_rate": 2.308457425724456e-05, "loss": 0.5702, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7190, "tokens_per_second_per_gpu": 2290.77 }, { "epoch": 0.7189934092270821, "grad_norm": 2.75, "learning_rate": 2.307935365663808e-05, "loss": 0.552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7200, "tokens_per_second_per_gpu": 2528.13 }, { "epoch": 0.719992011184342, "grad_norm": 2.765625, "learning_rate": 2.307412654318692e-05, "loss": 0.4913, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7210, "tokens_per_second_per_gpu": 2366.5 }, { "epoch": 0.7209906131416017, "grad_norm": 3.015625, "learning_rate": 2.306889292010899e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7220, "tokens_per_second_per_gpu": 2369.08 }, { "epoch": 0.7219892150988616, "grad_norm": 2.75, "learning_rate": 2.3063652790626207e-05, "loss": 0.5006, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7230, "tokens_per_second_per_gpu": 2635.17 }, { "epoch": 0.7229878170561215, "grad_norm": 3.21875, "learning_rate": 2.3058406157964485e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7240, "tokens_per_second_per_gpu": 2564.86 }, { "epoch": 0.7239864190133812, "grad_norm": 3.265625, "learning_rate": 2.305315302535376e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7250, "tokens_per_second_per_gpu": 2362.79 }, { "epoch": 0.7249850209706411, "grad_norm": 2.359375, "learning_rate": 2.304789339602794e-05, "loss": 0.5307, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7260, "tokens_per_second_per_gpu": 2328.38 }, { "epoch": 0.725983622927901, "grad_norm": 3.75, "learning_rate": 2.3042627273224967e-05, "loss": 0.5656, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7270, "tokens_per_second_per_gpu": 2415.82 }, { "epoch": 0.7269822248851607, "grad_norm": 2.828125, "learning_rate": 2.3037354660186754e-05, "loss": 0.5345, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7280, "tokens_per_second_per_gpu": 2287.31 }, { "epoch": 0.7279808268424206, "grad_norm": 3.046875, "learning_rate": 2.3032075560159216e-05, "loss": 0.5189, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7290, "tokens_per_second_per_gpu": 2276.4 }, { "epoch": 0.7289794287996805, "grad_norm": 3.8125, "learning_rate": 2.302678997639227e-05, "loss": 0.5787, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7300, "tokens_per_second_per_gpu": 2392.92 }, { "epoch": 0.7299780307569402, "grad_norm": 2.796875, "learning_rate": 2.3021497912139818e-05, "loss": 0.5499, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7310, "tokens_per_second_per_gpu": 2171.75 }, { "epoch": 0.7309766327142001, "grad_norm": 3.125, "learning_rate": 2.3016199370659743e-05, "loss": 0.4933, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7320, "tokens_per_second_per_gpu": 2325.68 }, { "epoch": 0.73197523467146, "grad_norm": 3.09375, "learning_rate": 2.3010894355213936e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7330, "tokens_per_second_per_gpu": 2644.43 }, { "epoch": 0.7329738366287198, "grad_norm": 3.515625, "learning_rate": 2.3005582869068258e-05, "loss": 0.5448, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7340, "tokens_per_second_per_gpu": 2217.24 }, { "epoch": 0.7339724385859796, "grad_norm": 2.5, "learning_rate": 2.3000264915492558e-05, "loss": 0.465, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7350, "tokens_per_second_per_gpu": 2644.47 }, { "epoch": 0.7349710405432395, "grad_norm": 3.265625, "learning_rate": 2.2994940497760665e-05, "loss": 0.5792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7360, "tokens_per_second_per_gpu": 2276.74 }, { "epoch": 0.7359696425004993, "grad_norm": 3.921875, "learning_rate": 2.2989609619150387e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7370, "tokens_per_second_per_gpu": 2352.0 }, { "epoch": 0.7369682444577591, "grad_norm": 4.21875, "learning_rate": 2.2984272282943515e-05, "loss": 0.5448, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7380, "tokens_per_second_per_gpu": 2319.48 }, { "epoch": 0.737966846415019, "grad_norm": 2.765625, "learning_rate": 2.297892849242581e-05, "loss": 0.5394, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7390, "tokens_per_second_per_gpu": 2427.45 }, { "epoch": 0.7389654483722788, "grad_norm": 3.484375, "learning_rate": 2.2973578250887008e-05, "loss": 0.5814, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7400, "tokens_per_second_per_gpu": 2458.18 }, { "epoch": 0.7399640503295386, "grad_norm": 2.671875, "learning_rate": 2.296822156162081e-05, "loss": 0.4813, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7410, "tokens_per_second_per_gpu": 2411.48 }, { "epoch": 0.7409626522867985, "grad_norm": 3.125, "learning_rate": 2.29628584279249e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7420, "tokens_per_second_per_gpu": 2220.23 }, { "epoch": 0.7419612542440583, "grad_norm": 4.1875, "learning_rate": 2.295748885310092e-05, "loss": 0.4731, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7430, "tokens_per_second_per_gpu": 2265.01 }, { "epoch": 0.7429598562013181, "grad_norm": 2.765625, "learning_rate": 2.2952112840454476e-05, "loss": 0.5564, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7440, "tokens_per_second_per_gpu": 2495.87 }, { "epoch": 0.743958458158578, "grad_norm": 3.3125, "learning_rate": 2.2946730393295145e-05, "loss": 0.5205, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7450, "tokens_per_second_per_gpu": 2418.8 }, { "epoch": 0.7449570601158378, "grad_norm": 3.6875, "learning_rate": 2.2941341514936454e-05, "loss": 0.4855, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7460, "tokens_per_second_per_gpu": 2295.14 }, { "epoch": 0.7459556620730977, "grad_norm": 3.90625, "learning_rate": 2.2935946208695902e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7470, "tokens_per_second_per_gpu": 2271.94 }, { "epoch": 0.7469542640303575, "grad_norm": 4.0625, "learning_rate": 2.2930544477894936e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7480, "tokens_per_second_per_gpu": 2558.26 }, { "epoch": 0.7479528659876173, "grad_norm": 3.015625, "learning_rate": 2.2925136325858962e-05, "loss": 0.4796, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7490, "tokens_per_second_per_gpu": 2490.03 }, { "epoch": 0.7489514679448772, "grad_norm": 3.890625, "learning_rate": 2.2919721755917333e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7500, "tokens_per_second_per_gpu": 2555.49 }, { "epoch": 0.749950069902137, "grad_norm": 2.453125, "learning_rate": 2.291430077140337e-05, "loss": 0.5501, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7510, "tokens_per_second_per_gpu": 2397.4 }, { "epoch": 0.7509486718593968, "grad_norm": 3.34375, "learning_rate": 2.2908873375654314e-05, "loss": 0.5462, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7520, "tokens_per_second_per_gpu": 2462.49 }, { "epoch": 0.7519472738166567, "grad_norm": 2.828125, "learning_rate": 2.290343957201138e-05, "loss": 0.5414, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7530, "tokens_per_second_per_gpu": 2351.67 }, { "epoch": 0.7529458757739165, "grad_norm": 3.703125, "learning_rate": 2.2897999363819716e-05, "loss": 0.5669, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7540, "tokens_per_second_per_gpu": 2431.55 }, { "epoch": 0.7539444777311763, "grad_norm": 3.046875, "learning_rate": 2.2892552754428414e-05, "loss": 0.5552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7550, "tokens_per_second_per_gpu": 2712.01 }, { "epoch": 0.7549430796884362, "grad_norm": 2.84375, "learning_rate": 2.288709974719051e-05, "loss": 0.5529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7560, "tokens_per_second_per_gpu": 2276.27 }, { "epoch": 0.755941681645696, "grad_norm": 3.40625, "learning_rate": 2.2881640345462968e-05, "loss": 0.5499, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7570, "tokens_per_second_per_gpu": 2387.67 }, { "epoch": 0.7569402836029558, "grad_norm": 3.203125, "learning_rate": 2.2876174552606702e-05, "loss": 0.5291, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7580, "tokens_per_second_per_gpu": 2293.15 }, { "epoch": 0.7579388855602157, "grad_norm": 3.078125, "learning_rate": 2.2870702371986553e-05, "loss": 0.497, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7590, "tokens_per_second_per_gpu": 2539.85 }, { "epoch": 0.7589374875174756, "grad_norm": 4.03125, "learning_rate": 2.2865223806971296e-05, "loss": 0.5481, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7600, "tokens_per_second_per_gpu": 2310.48 }, { "epoch": 0.7599360894747353, "grad_norm": 3.078125, "learning_rate": 2.2859738860933637e-05, "loss": 0.5349, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7610, "tokens_per_second_per_gpu": 2495.11 }, { "epoch": 0.7609346914319952, "grad_norm": 2.203125, "learning_rate": 2.2854247537250207e-05, "loss": 0.5207, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7620, "tokens_per_second_per_gpu": 2522.13 }, { "epoch": 0.7619332933892551, "grad_norm": 3.671875, "learning_rate": 2.284874983930157e-05, "loss": 0.5002, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7630, "tokens_per_second_per_gpu": 2432.69 }, { "epoch": 0.7629318953465148, "grad_norm": 2.703125, "learning_rate": 2.2843245770472206e-05, "loss": 0.556, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7640, "tokens_per_second_per_gpu": 2412.17 }, { "epoch": 0.7639304973037747, "grad_norm": 2.859375, "learning_rate": 2.2837735334150522e-05, "loss": 0.5373, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7650, "tokens_per_second_per_gpu": 2462.07 }, { "epoch": 0.7649290992610346, "grad_norm": 3.109375, "learning_rate": 2.283221853372885e-05, "loss": 0.5064, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7660, "tokens_per_second_per_gpu": 2462.24 }, { "epoch": 0.7659277012182943, "grad_norm": 3.75, "learning_rate": 2.2826695372603423e-05, "loss": 0.5419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7670, "tokens_per_second_per_gpu": 2535.41 }, { "epoch": 0.7669263031755542, "grad_norm": 2.40625, "learning_rate": 2.282116585417441e-05, "loss": 0.5544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7680, "tokens_per_second_per_gpu": 2502.21 }, { "epoch": 0.7679249051328141, "grad_norm": 3.109375, "learning_rate": 2.2815629981845876e-05, "loss": 0.5378, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7690, "tokens_per_second_per_gpu": 2334.01 }, { "epoch": 0.7689235070900738, "grad_norm": 4.28125, "learning_rate": 2.2810087759025816e-05, "loss": 0.5439, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7700, "tokens_per_second_per_gpu": 2448.19 }, { "epoch": 0.7699221090473337, "grad_norm": 2.796875, "learning_rate": 2.2804539189126114e-05, "loss": 0.5336, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7710, "tokens_per_second_per_gpu": 2568.44 }, { "epoch": 0.7709207110045936, "grad_norm": 2.875, "learning_rate": 2.279898427556258e-05, "loss": 0.4611, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7720, "tokens_per_second_per_gpu": 2355.85 }, { "epoch": 0.7719193129618535, "grad_norm": 2.859375, "learning_rate": 2.279342302175491e-05, "loss": 0.5298, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7730, "tokens_per_second_per_gpu": 2523.22 }, { "epoch": 0.7729179149191132, "grad_norm": 3.65625, "learning_rate": 2.2787855431126725e-05, "loss": 0.5819, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7740, "tokens_per_second_per_gpu": 2335.88 }, { "epoch": 0.7739165168763731, "grad_norm": 3.4375, "learning_rate": 2.2782281507105536e-05, "loss": 0.5648, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7750, "tokens_per_second_per_gpu": 2541.94 }, { "epoch": 0.774915118833633, "grad_norm": 3.984375, "learning_rate": 2.2776701253122746e-05, "loss": 0.5873, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7760, "tokens_per_second_per_gpu": 2538.55 }, { "epoch": 0.7759137207908927, "grad_norm": 4.15625, "learning_rate": 2.2771114672613665e-05, "loss": 0.5665, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7770, "tokens_per_second_per_gpu": 2249.96 }, { "epoch": 0.7769123227481526, "grad_norm": 2.203125, "learning_rate": 2.27655217690175e-05, "loss": 0.5088, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7780, "tokens_per_second_per_gpu": 2460.63 }, { "epoch": 0.7779109247054125, "grad_norm": 2.625, "learning_rate": 2.2759922545777333e-05, "loss": 0.5716, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7790, "tokens_per_second_per_gpu": 2432.44 }, { "epoch": 0.7789095266626722, "grad_norm": 2.953125, "learning_rate": 2.2754317006340163e-05, "loss": 0.5654, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7800, "tokens_per_second_per_gpu": 2439.62 }, { "epoch": 0.7799081286199321, "grad_norm": 3.015625, "learning_rate": 2.2748705154156854e-05, "loss": 0.5326, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7810, "tokens_per_second_per_gpu": 2454.17 }, { "epoch": 0.780906730577192, "grad_norm": 2.84375, "learning_rate": 2.2743086992682168e-05, "loss": 0.5758, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7820, "tokens_per_second_per_gpu": 2393.95 }, { "epoch": 0.7819053325344517, "grad_norm": 2.421875, "learning_rate": 2.2737462525374747e-05, "loss": 0.5469, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7830, "tokens_per_second_per_gpu": 2462.24 }, { "epoch": 0.7829039344917116, "grad_norm": 4.0, "learning_rate": 2.273183175569712e-05, "loss": 0.5348, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7840, "tokens_per_second_per_gpu": 2306.04 }, { "epoch": 0.7839025364489715, "grad_norm": 3.140625, "learning_rate": 2.272619468711569e-05, "loss": 0.5661, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7850, "tokens_per_second_per_gpu": 2517.93 }, { "epoch": 0.7849011384062313, "grad_norm": 3.578125, "learning_rate": 2.272055132310074e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7860, "tokens_per_second_per_gpu": 2366.89 }, { "epoch": 0.7858997403634911, "grad_norm": 2.828125, "learning_rate": 2.271490166712643e-05, "loss": 0.5423, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7870, "tokens_per_second_per_gpu": 2471.86 }, { "epoch": 0.786898342320751, "grad_norm": 3.359375, "learning_rate": 2.270924572267079e-05, "loss": 0.579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7880, "tokens_per_second_per_gpu": 2515.07 }, { "epoch": 0.7878969442780108, "grad_norm": 3.53125, "learning_rate": 2.2703583493215726e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7890, "tokens_per_second_per_gpu": 2386.43 }, { "epoch": 0.7888955462352706, "grad_norm": 2.953125, "learning_rate": 2.269791498224701e-05, "loss": 0.5594, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7900, "tokens_per_second_per_gpu": 2442.99 }, { "epoch": 0.7898941481925305, "grad_norm": 2.25, "learning_rate": 2.2692240193254276e-05, "loss": 0.4759, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7910, "tokens_per_second_per_gpu": 2473.48 }, { "epoch": 0.7908927501497903, "grad_norm": 3.171875, "learning_rate": 2.268655912973104e-05, "loss": 0.5096, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7920, "tokens_per_second_per_gpu": 2379.89 }, { "epoch": 0.7918913521070501, "grad_norm": 2.90625, "learning_rate": 2.268087179517466e-05, "loss": 0.4953, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7930, "tokens_per_second_per_gpu": 2336.16 }, { "epoch": 0.79288995406431, "grad_norm": 4.34375, "learning_rate": 2.267517819308636e-05, "loss": 0.5291, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7940, "tokens_per_second_per_gpu": 2167.25 }, { "epoch": 0.7938885560215698, "grad_norm": 3.578125, "learning_rate": 2.266947832697124e-05, "loss": 0.514, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7950, "tokens_per_second_per_gpu": 2383.76 }, { "epoch": 0.7948871579788296, "grad_norm": 2.71875, "learning_rate": 2.2663772200338232e-05, "loss": 0.5056, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7960, "tokens_per_second_per_gpu": 2417.16 }, { "epoch": 0.7958857599360895, "grad_norm": 3.34375, "learning_rate": 2.2658059816700135e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7970, "tokens_per_second_per_gpu": 2381.21 }, { "epoch": 0.7968843618933493, "grad_norm": 2.828125, "learning_rate": 2.26523411795736e-05, "loss": 0.5632, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7980, "tokens_per_second_per_gpu": 2692.3 }, { "epoch": 0.7978829638506092, "grad_norm": 3.3125, "learning_rate": 2.2646616292479123e-05, "loss": 0.5083, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 7990, "tokens_per_second_per_gpu": 2592.2 }, { "epoch": 0.798881565807869, "grad_norm": 3.40625, "learning_rate": 2.2640885158941048e-05, "loss": 0.5337, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8000, "tokens_per_second_per_gpu": 2286.79 }, { "epoch": 0.7998801677651288, "grad_norm": 3.421875, "learning_rate": 2.2635147782487564e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8010, "tokens_per_second_per_gpu": 2327.43 }, { "epoch": 0.8008787697223887, "grad_norm": 3.375, "learning_rate": 2.2629404166650715e-05, "loss": 0.5278, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8020, "tokens_per_second_per_gpu": 2389.36 }, { "epoch": 0.8018773716796485, "grad_norm": 2.75, "learning_rate": 2.2623654314966368e-05, "loss": 0.5206, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8030, "tokens_per_second_per_gpu": 2467.61 }, { "epoch": 0.8028759736369083, "grad_norm": 3.5625, "learning_rate": 2.2617898230974243e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8040, "tokens_per_second_per_gpu": 2528.73 }, { "epoch": 0.8038745755941682, "grad_norm": 2.5625, "learning_rate": 2.261213591821789e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8050, "tokens_per_second_per_gpu": 2309.69 }, { "epoch": 0.804873177551428, "grad_norm": 2.578125, "learning_rate": 2.2606367380244688e-05, "loss": 0.5802, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8060, "tokens_per_second_per_gpu": 2442.32 }, { "epoch": 0.8058717795086878, "grad_norm": 2.796875, "learning_rate": 2.2600592620605865e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8070, "tokens_per_second_per_gpu": 2379.37 }, { "epoch": 0.8068703814659477, "grad_norm": 3.765625, "learning_rate": 2.259481164285646e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8080, "tokens_per_second_per_gpu": 2443.18 }, { "epoch": 0.8078689834232075, "grad_norm": 2.703125, "learning_rate": 2.2589024450555357e-05, "loss": 0.5633, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8090, "tokens_per_second_per_gpu": 2492.64 }, { "epoch": 0.8088675853804673, "grad_norm": 3.71875, "learning_rate": 2.258323104726525e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8100, "tokens_per_second_per_gpu": 2350.55 }, { "epoch": 0.8098661873377272, "grad_norm": 3.6875, "learning_rate": 2.2577431436552676e-05, "loss": 0.5215, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8110, "tokens_per_second_per_gpu": 2367.5 }, { "epoch": 0.8108647892949871, "grad_norm": 3.25, "learning_rate": 2.2571625621987973e-05, "loss": 0.5561, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8120, "tokens_per_second_per_gpu": 2550.24 }, { "epoch": 0.8118633912522468, "grad_norm": 3.046875, "learning_rate": 2.2565813607145308e-05, "loss": 0.4721, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8130, "tokens_per_second_per_gpu": 2424.45 }, { "epoch": 0.8128619932095067, "grad_norm": 3.28125, "learning_rate": 2.255999539560267e-05, "loss": 0.5161, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8140, "tokens_per_second_per_gpu": 2517.62 }, { "epoch": 0.8138605951667666, "grad_norm": 3.953125, "learning_rate": 2.255417099094185e-05, "loss": 0.5448, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8150, "tokens_per_second_per_gpu": 2434.71 }, { "epoch": 0.8148591971240263, "grad_norm": 2.671875, "learning_rate": 2.254834039674846e-05, "loss": 0.5371, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8160, "tokens_per_second_per_gpu": 2211.14 }, { "epoch": 0.8158577990812862, "grad_norm": 3.078125, "learning_rate": 2.2542503616611926e-05, "loss": 0.5499, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8170, "tokens_per_second_per_gpu": 2433.32 }, { "epoch": 0.8168564010385461, "grad_norm": 3.1875, "learning_rate": 2.2536660654125467e-05, "loss": 0.5142, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8180, "tokens_per_second_per_gpu": 2347.09 }, { "epoch": 0.8178550029958058, "grad_norm": 2.515625, "learning_rate": 2.2530811512886132e-05, "loss": 0.505, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8190, "tokens_per_second_per_gpu": 2487.67 }, { "epoch": 0.8188536049530657, "grad_norm": 2.859375, "learning_rate": 2.2524956196494752e-05, "loss": 0.5634, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8200, "tokens_per_second_per_gpu": 2507.82 }, { "epoch": 0.8198522069103256, "grad_norm": 3.09375, "learning_rate": 2.2519094708555965e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8210, "tokens_per_second_per_gpu": 2200.55 }, { "epoch": 0.8208508088675854, "grad_norm": 3.09375, "learning_rate": 2.2513227052678216e-05, "loss": 0.476, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8220, "tokens_per_second_per_gpu": 2341.89 }, { "epoch": 0.8218494108248452, "grad_norm": 3.109375, "learning_rate": 2.2507353232473738e-05, "loss": 0.5049, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8230, "tokens_per_second_per_gpu": 2332.96 }, { "epoch": 0.8228480127821051, "grad_norm": 2.375, "learning_rate": 2.2501473251558568e-05, "loss": 0.5417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8240, "tokens_per_second_per_gpu": 2504.4 }, { "epoch": 0.8238466147393649, "grad_norm": 2.953125, "learning_rate": 2.2495587113552525e-05, "loss": 0.5401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8250, "tokens_per_second_per_gpu": 2392.25 }, { "epoch": 0.8248452166966247, "grad_norm": 2.90625, "learning_rate": 2.2489694822079227e-05, "loss": 0.4868, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8260, "tokens_per_second_per_gpu": 2354.64 }, { "epoch": 0.8258438186538846, "grad_norm": 3.578125, "learning_rate": 2.2483796380766072e-05, "loss": 0.5724, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8270, "tokens_per_second_per_gpu": 2410.26 }, { "epoch": 0.8268424206111444, "grad_norm": 3.015625, "learning_rate": 2.2477891793244257e-05, "loss": 0.5375, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8280, "tokens_per_second_per_gpu": 2465.44 }, { "epoch": 0.8278410225684042, "grad_norm": 2.765625, "learning_rate": 2.247198106314875e-05, "loss": 0.4739, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8290, "tokens_per_second_per_gpu": 2475.41 }, { "epoch": 0.8288396245256641, "grad_norm": 3.078125, "learning_rate": 2.24660641941183e-05, "loss": 0.5512, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8300, "tokens_per_second_per_gpu": 2497.96 }, { "epoch": 0.8298382264829239, "grad_norm": 3.03125, "learning_rate": 2.2460141189795453e-05, "loss": 0.4953, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8310, "tokens_per_second_per_gpu": 2393.72 }, { "epoch": 0.8308368284401837, "grad_norm": 3.453125, "learning_rate": 2.2454212053826513e-05, "loss": 0.5644, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8320, "tokens_per_second_per_gpu": 2346.15 }, { "epoch": 0.8318354303974436, "grad_norm": 2.375, "learning_rate": 2.244827678986156e-05, "loss": 0.5691, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8330, "tokens_per_second_per_gpu": 2324.96 }, { "epoch": 0.8328340323547034, "grad_norm": 2.90625, "learning_rate": 2.244233540155446e-05, "loss": 0.5595, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8340, "tokens_per_second_per_gpu": 2500.1 }, { "epoch": 0.8338326343119633, "grad_norm": 3.21875, "learning_rate": 2.2436387892562834e-05, "loss": 0.5794, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8350, "tokens_per_second_per_gpu": 2307.01 }, { "epoch": 0.8348312362692231, "grad_norm": 3.171875, "learning_rate": 2.243043426654808e-05, "loss": 0.4888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8360, "tokens_per_second_per_gpu": 2174.5 }, { "epoch": 0.8358298382264829, "grad_norm": 3.328125, "learning_rate": 2.2424474527175364e-05, "loss": 0.557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8370, "tokens_per_second_per_gpu": 2300.98 }, { "epoch": 0.8368284401837428, "grad_norm": 3.375, "learning_rate": 2.2418508678113602e-05, "loss": 0.5397, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8380, "tokens_per_second_per_gpu": 2091.6 }, { "epoch": 0.8378270421410026, "grad_norm": 2.765625, "learning_rate": 2.2412536723035494e-05, "loss": 0.4914, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8390, "tokens_per_second_per_gpu": 2439.76 }, { "epoch": 0.8388256440982624, "grad_norm": 3.4375, "learning_rate": 2.2406558665617472e-05, "loss": 0.5575, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8400, "tokens_per_second_per_gpu": 2443.31 }, { "epoch": 0.8398242460555223, "grad_norm": 2.828125, "learning_rate": 2.2400574509539746e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8410, "tokens_per_second_per_gpu": 2517.69 }, { "epoch": 0.8408228480127821, "grad_norm": 3.0625, "learning_rate": 2.239458425848627e-05, "loss": 0.5573, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8420, "tokens_per_second_per_gpu": 2431.48 }, { "epoch": 0.8418214499700419, "grad_norm": 3.1875, "learning_rate": 2.2388587916144753e-05, "loss": 0.5312, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8430, "tokens_per_second_per_gpu": 2605.73 }, { "epoch": 0.8428200519273018, "grad_norm": 4.0625, "learning_rate": 2.2382585486206656e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8440, "tokens_per_second_per_gpu": 2422.86 }, { "epoch": 0.8438186538845616, "grad_norm": 2.5, "learning_rate": 2.237657697236718e-05, "loss": 0.5238, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8450, "tokens_per_second_per_gpu": 2482.93 }, { "epoch": 0.8448172558418214, "grad_norm": 3.046875, "learning_rate": 2.237056237832528e-05, "loss": 0.5356, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8460, "tokens_per_second_per_gpu": 2447.26 }, { "epoch": 0.8458158577990813, "grad_norm": 2.40625, "learning_rate": 2.236454170778365e-05, "loss": 0.5406, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8470, "tokens_per_second_per_gpu": 2562.36 }, { "epoch": 0.8468144597563412, "grad_norm": 3.328125, "learning_rate": 2.2358514964448727e-05, "loss": 0.5014, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8480, "tokens_per_second_per_gpu": 2476.69 }, { "epoch": 0.8478130617136009, "grad_norm": 3.0625, "learning_rate": 2.2352482152030678e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8490, "tokens_per_second_per_gpu": 2559.22 }, { "epoch": 0.8488116636708608, "grad_norm": 3.046875, "learning_rate": 2.2346443274243428e-05, "loss": 0.4589, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8500, "tokens_per_second_per_gpu": 2378.61 }, { "epoch": 0.8498102656281207, "grad_norm": 2.796875, "learning_rate": 2.2340398334804607e-05, "loss": 0.5238, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8510, "tokens_per_second_per_gpu": 2569.53 }, { "epoch": 0.8508088675853804, "grad_norm": 3.1875, "learning_rate": 2.2334347337435598e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8520, "tokens_per_second_per_gpu": 2400.04 }, { "epoch": 0.8518074695426403, "grad_norm": 3.0625, "learning_rate": 2.2328290285861504e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8530, "tokens_per_second_per_gpu": 2441.67 }, { "epoch": 0.8528060714999002, "grad_norm": 3.109375, "learning_rate": 2.2322227183811156e-05, "loss": 0.4745, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8540, "tokens_per_second_per_gpu": 2390.26 }, { "epoch": 0.8538046734571599, "grad_norm": 3.109375, "learning_rate": 2.231615803501711e-05, "loss": 0.5554, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8550, "tokens_per_second_per_gpu": 2391.85 }, { "epoch": 0.8548032754144198, "grad_norm": 2.921875, "learning_rate": 2.2310082843215654e-05, "loss": 0.5477, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8560, "tokens_per_second_per_gpu": 2475.79 }, { "epoch": 0.8558018773716797, "grad_norm": 2.75, "learning_rate": 2.230400161214678e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8570, "tokens_per_second_per_gpu": 2216.04 }, { "epoch": 0.8568004793289394, "grad_norm": 3.21875, "learning_rate": 2.229791434555421e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8580, "tokens_per_second_per_gpu": 2365.15 }, { "epoch": 0.8577990812861993, "grad_norm": 3.46875, "learning_rate": 2.2291821047185373e-05, "loss": 0.5401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8590, "tokens_per_second_per_gpu": 2466.65 }, { "epoch": 0.8587976832434592, "grad_norm": 2.609375, "learning_rate": 2.228572172079142e-05, "loss": 0.5552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8600, "tokens_per_second_per_gpu": 2427.27 }, { "epoch": 0.859796285200719, "grad_norm": 3.34375, "learning_rate": 2.2279616370127204e-05, "loss": 0.5406, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8610, "tokens_per_second_per_gpu": 2549.08 }, { "epoch": 0.8607948871579788, "grad_norm": 2.90625, "learning_rate": 2.227350499895129e-05, "loss": 0.5832, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8620, "tokens_per_second_per_gpu": 2311.32 }, { "epoch": 0.8617934891152387, "grad_norm": 2.875, "learning_rate": 2.226738761102596e-05, "loss": 0.5782, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8630, "tokens_per_second_per_gpu": 2314.0 }, { "epoch": 0.8627920910724985, "grad_norm": 2.84375, "learning_rate": 2.2261264210117183e-05, "loss": 0.5472, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8640, "tokens_per_second_per_gpu": 2419.36 }, { "epoch": 0.8637906930297583, "grad_norm": 3.203125, "learning_rate": 2.2255134799994636e-05, "loss": 0.4888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8650, "tokens_per_second_per_gpu": 2460.36 }, { "epoch": 0.8647892949870182, "grad_norm": 2.71875, "learning_rate": 2.2248999384431704e-05, "loss": 0.4911, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8660, "tokens_per_second_per_gpu": 2442.51 }, { "epoch": 0.865787896944278, "grad_norm": 3.234375, "learning_rate": 2.2242857967205454e-05, "loss": 0.5802, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8670, "tokens_per_second_per_gpu": 2148.28 }, { "epoch": 0.8667864989015378, "grad_norm": 2.859375, "learning_rate": 2.2236710552096655e-05, "loss": 0.5632, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8680, "tokens_per_second_per_gpu": 2279.43 }, { "epoch": 0.8677851008587977, "grad_norm": 3.484375, "learning_rate": 2.223055714288978e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8690, "tokens_per_second_per_gpu": 2243.76 }, { "epoch": 0.8687837028160575, "grad_norm": 3.6875, "learning_rate": 2.2224397743372966e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8700, "tokens_per_second_per_gpu": 2294.03 }, { "epoch": 0.8697823047733173, "grad_norm": 4.1875, "learning_rate": 2.2218232357338065e-05, "loss": 0.5124, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8710, "tokens_per_second_per_gpu": 2569.88 }, { "epoch": 0.8707809067305772, "grad_norm": 4.5, "learning_rate": 2.221206098858059e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8720, "tokens_per_second_per_gpu": 2492.79 }, { "epoch": 0.871779508687837, "grad_norm": 2.53125, "learning_rate": 2.2205883640899762e-05, "loss": 0.5355, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8730, "tokens_per_second_per_gpu": 2384.89 }, { "epoch": 0.8727781106450969, "grad_norm": 3.53125, "learning_rate": 2.219970031809846e-05, "loss": 0.555, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8740, "tokens_per_second_per_gpu": 2353.52 }, { "epoch": 0.8737767126023567, "grad_norm": 3.328125, "learning_rate": 2.2193511023983254e-05, "loss": 0.5759, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8750, "tokens_per_second_per_gpu": 2463.04 }, { "epoch": 0.8747753145596165, "grad_norm": 3.0, "learning_rate": 2.2187315762364392e-05, "loss": 0.5653, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8760, "tokens_per_second_per_gpu": 2370.01 }, { "epoch": 0.8757739165168764, "grad_norm": 3.125, "learning_rate": 2.2181114537055784e-05, "loss": 0.5424, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8770, "tokens_per_second_per_gpu": 2537.85 }, { "epoch": 0.8767725184741362, "grad_norm": 3.484375, "learning_rate": 2.217490735187502e-05, "loss": 0.5139, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8780, "tokens_per_second_per_gpu": 2351.31 }, { "epoch": 0.877771120431396, "grad_norm": 2.9375, "learning_rate": 2.2168694210643362e-05, "loss": 0.5176, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8790, "tokens_per_second_per_gpu": 2283.67 }, { "epoch": 0.8787697223886559, "grad_norm": 2.75, "learning_rate": 2.216247511718573e-05, "loss": 0.5057, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8800, "tokens_per_second_per_gpu": 2420.84 }, { "epoch": 0.8797683243459157, "grad_norm": 2.578125, "learning_rate": 2.2156250075330712e-05, "loss": 0.4969, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8810, "tokens_per_second_per_gpu": 2627.5 }, { "epoch": 0.8807669263031755, "grad_norm": 3.484375, "learning_rate": 2.2150019088910563e-05, "loss": 0.5622, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8820, "tokens_per_second_per_gpu": 2481.65 }, { "epoch": 0.8817655282604354, "grad_norm": 3.609375, "learning_rate": 2.2143782161761182e-05, "loss": 0.5784, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8830, "tokens_per_second_per_gpu": 2233.78 }, { "epoch": 0.8827641302176952, "grad_norm": 2.75, "learning_rate": 2.213753929772215e-05, "loss": 0.5036, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8840, "tokens_per_second_per_gpu": 2430.47 }, { "epoch": 0.883762732174955, "grad_norm": 3.046875, "learning_rate": 2.2131290500636677e-05, "loss": 0.5815, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8850, "tokens_per_second_per_gpu": 2311.83 }, { "epoch": 0.8847613341322149, "grad_norm": 3.0625, "learning_rate": 2.2125035774351647e-05, "loss": 0.5618, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8860, "tokens_per_second_per_gpu": 2477.31 }, { "epoch": 0.8857599360894748, "grad_norm": 3.5625, "learning_rate": 2.211877512271758e-05, "loss": 0.5482, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8870, "tokens_per_second_per_gpu": 2460.59 }, { "epoch": 0.8867585380467345, "grad_norm": 3.484375, "learning_rate": 2.2112508549588645e-05, "loss": 0.5454, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8880, "tokens_per_second_per_gpu": 2306.97 }, { "epoch": 0.8877571400039944, "grad_norm": 3.109375, "learning_rate": 2.2106236058822664e-05, "loss": 0.5654, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8890, "tokens_per_second_per_gpu": 2512.2 }, { "epoch": 0.8887557419612543, "grad_norm": 3.859375, "learning_rate": 2.2099957654281094e-05, "loss": 0.5326, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8900, "tokens_per_second_per_gpu": 2394.49 }, { "epoch": 0.889754343918514, "grad_norm": 3.015625, "learning_rate": 2.2093673339829044e-05, "loss": 0.5256, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8910, "tokens_per_second_per_gpu": 2531.19 }, { "epoch": 0.8907529458757739, "grad_norm": 3.828125, "learning_rate": 2.2087383119335248e-05, "loss": 0.5619, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8920, "tokens_per_second_per_gpu": 2409.18 }, { "epoch": 0.8917515478330338, "grad_norm": 2.984375, "learning_rate": 2.2081086996672078e-05, "loss": 0.4653, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8930, "tokens_per_second_per_gpu": 2377.94 }, { "epoch": 0.8927501497902935, "grad_norm": 3.234375, "learning_rate": 2.207478497571555e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8940, "tokens_per_second_per_gpu": 2201.82 }, { "epoch": 0.8937487517475534, "grad_norm": 3.265625, "learning_rate": 2.2068477060345298e-05, "loss": 0.5398, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8950, "tokens_per_second_per_gpu": 2527.55 }, { "epoch": 0.8947473537048133, "grad_norm": 2.9375, "learning_rate": 2.2062163254444597e-05, "loss": 0.5552, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8960, "tokens_per_second_per_gpu": 2453.44 }, { "epoch": 0.895745955662073, "grad_norm": 4.0625, "learning_rate": 2.205584356190034e-05, "loss": 0.5452, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8970, "tokens_per_second_per_gpu": 2167.25 }, { "epoch": 0.8967445576193329, "grad_norm": 3.09375, "learning_rate": 2.204951798660304e-05, "loss": 0.5775, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8980, "tokens_per_second_per_gpu": 2394.48 }, { "epoch": 0.8977431595765928, "grad_norm": 3.4375, "learning_rate": 2.2043186532446846e-05, "loss": 0.518, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 8990, "tokens_per_second_per_gpu": 2521.8 }, { "epoch": 0.8987417615338527, "grad_norm": 2.796875, "learning_rate": 2.2036849203329513e-05, "loss": 0.4625, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9000, "tokens_per_second_per_gpu": 2284.03 }, { "epoch": 0.8997403634911124, "grad_norm": 2.96875, "learning_rate": 2.203050600315241e-05, "loss": 0.5177, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9010, "tokens_per_second_per_gpu": 2416.24 }, { "epoch": 0.9007389654483723, "grad_norm": 3.5, "learning_rate": 2.2024156935820548e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9020, "tokens_per_second_per_gpu": 2478.94 }, { "epoch": 0.9017375674056322, "grad_norm": 3.265625, "learning_rate": 2.201780200524251e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9030, "tokens_per_second_per_gpu": 2455.49 }, { "epoch": 0.9027361693628919, "grad_norm": 3.515625, "learning_rate": 2.201144121533052e-05, "loss": 0.525, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9040, "tokens_per_second_per_gpu": 2193.0 }, { "epoch": 0.9037347713201518, "grad_norm": 3.359375, "learning_rate": 2.200507457000039e-05, "loss": 0.4754, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9050, "tokens_per_second_per_gpu": 2320.43 }, { "epoch": 0.9047333732774117, "grad_norm": 3.890625, "learning_rate": 2.1998702073171546e-05, "loss": 0.5258, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9060, "tokens_per_second_per_gpu": 2342.26 }, { "epoch": 0.9057319752346714, "grad_norm": 3.890625, "learning_rate": 2.1992323728767015e-05, "loss": 0.4996, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9070, "tokens_per_second_per_gpu": 2410.03 }, { "epoch": 0.9067305771919313, "grad_norm": 2.59375, "learning_rate": 2.1985939540713425e-05, "loss": 0.5307, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9080, "tokens_per_second_per_gpu": 2339.14 }, { "epoch": 0.9077291791491912, "grad_norm": 2.65625, "learning_rate": 2.1979549512940994e-05, "loss": 0.5273, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9090, "tokens_per_second_per_gpu": 2419.89 }, { "epoch": 0.9087277811064509, "grad_norm": 3.125, "learning_rate": 2.1973153649383545e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9100, "tokens_per_second_per_gpu": 2382.63 }, { "epoch": 0.9097263830637108, "grad_norm": 2.9375, "learning_rate": 2.1966751953978494e-05, "loss": 0.544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9110, "tokens_per_second_per_gpu": 2427.62 }, { "epoch": 0.9107249850209707, "grad_norm": 4.09375, "learning_rate": 2.196034443066683e-05, "loss": 0.5127, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9120, "tokens_per_second_per_gpu": 2397.15 }, { "epoch": 0.9117235869782305, "grad_norm": 2.625, "learning_rate": 2.1953931083393153e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9130, "tokens_per_second_per_gpu": 2357.27 }, { "epoch": 0.9127221889354903, "grad_norm": 2.78125, "learning_rate": 2.194751191610563e-05, "loss": 0.5544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9140, "tokens_per_second_per_gpu": 2321.5 }, { "epoch": 0.9137207908927502, "grad_norm": 3.0, "learning_rate": 2.1941086932756026e-05, "loss": 0.546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9150, "tokens_per_second_per_gpu": 2600.74 }, { "epoch": 0.91471939285001, "grad_norm": 2.3125, "learning_rate": 2.1934656137299675e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9160, "tokens_per_second_per_gpu": 2372.1 }, { "epoch": 0.9157179948072698, "grad_norm": 2.40625, "learning_rate": 2.192821953369549e-05, "loss": 0.4995, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9170, "tokens_per_second_per_gpu": 2437.88 }, { "epoch": 0.9167165967645297, "grad_norm": 3.3125, "learning_rate": 2.1921777125905967e-05, "loss": 0.5347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9180, "tokens_per_second_per_gpu": 2388.22 }, { "epoch": 0.9177151987217895, "grad_norm": 2.78125, "learning_rate": 2.1915328917897167e-05, "loss": 0.4701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9190, "tokens_per_second_per_gpu": 2188.95 }, { "epoch": 0.9187138006790493, "grad_norm": 2.65625, "learning_rate": 2.1908874913638734e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9200, "tokens_per_second_per_gpu": 2365.17 }, { "epoch": 0.9197124026363092, "grad_norm": 2.46875, "learning_rate": 2.1902415117103857e-05, "loss": 0.4752, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9210, "tokens_per_second_per_gpu": 2374.77 }, { "epoch": 0.920711004593569, "grad_norm": 3.390625, "learning_rate": 2.189594953226932e-05, "loss": 0.557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9220, "tokens_per_second_per_gpu": 2277.15 }, { "epoch": 0.9217096065508288, "grad_norm": 2.90625, "learning_rate": 2.1889478163115446e-05, "loss": 0.534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9230, "tokens_per_second_per_gpu": 2457.7 }, { "epoch": 0.9227082085080887, "grad_norm": 2.046875, "learning_rate": 2.1883001013626137e-05, "loss": 0.5348, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9240, "tokens_per_second_per_gpu": 2464.38 }, { "epoch": 0.9237068104653485, "grad_norm": 3.515625, "learning_rate": 2.1876518087788835e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9250, "tokens_per_second_per_gpu": 2340.68 }, { "epoch": 0.9247054124226084, "grad_norm": 3.296875, "learning_rate": 2.187002938959456e-05, "loss": 0.5318, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9260, "tokens_per_second_per_gpu": 2377.95 }, { "epoch": 0.9257040143798682, "grad_norm": 4.0625, "learning_rate": 2.1863534923037865e-05, "loss": 0.5413, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9270, "tokens_per_second_per_gpu": 2344.49 }, { "epoch": 0.926702616337128, "grad_norm": 3.734375, "learning_rate": 2.1857034692116866e-05, "loss": 0.5558, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9280, "tokens_per_second_per_gpu": 2413.81 }, { "epoch": 0.9277012182943879, "grad_norm": 3.640625, "learning_rate": 2.1850528700833228e-05, "loss": 0.5688, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9290, "tokens_per_second_per_gpu": 2539.54 }, { "epoch": 0.9286998202516477, "grad_norm": 2.390625, "learning_rate": 2.1844016953192156e-05, "loss": 0.5166, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9300, "tokens_per_second_per_gpu": 2365.44 }, { "epoch": 0.9296984222089075, "grad_norm": 3.03125, "learning_rate": 2.18374994532024e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9310, "tokens_per_second_per_gpu": 2366.15 }, { "epoch": 0.9306970241661674, "grad_norm": 3.5625, "learning_rate": 2.1830976204876253e-05, "loss": 0.557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9320, "tokens_per_second_per_gpu": 2378.9 }, { "epoch": 0.9316956261234272, "grad_norm": 2.65625, "learning_rate": 2.1824447212229552e-05, "loss": 0.547, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9330, "tokens_per_second_per_gpu": 2542.11 }, { "epoch": 0.932694228080687, "grad_norm": 2.546875, "learning_rate": 2.1817912479281656e-05, "loss": 0.4942, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9340, "tokens_per_second_per_gpu": 2572.56 }, { "epoch": 0.9336928300379469, "grad_norm": 2.3125, "learning_rate": 2.1811372010055476e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9350, "tokens_per_second_per_gpu": 2451.85 }, { "epoch": 0.9346914319952067, "grad_norm": 3.15625, "learning_rate": 2.1804825808577438e-05, "loss": 0.5453, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9360, "tokens_per_second_per_gpu": 2369.45 }, { "epoch": 0.9356900339524665, "grad_norm": 2.46875, "learning_rate": 2.179827387887751e-05, "loss": 0.4714, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9370, "tokens_per_second_per_gpu": 2535.88 }, { "epoch": 0.9366886359097264, "grad_norm": 2.453125, "learning_rate": 2.1791716224989177e-05, "loss": 0.5206, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9380, "tokens_per_second_per_gpu": 2423.01 }, { "epoch": 0.9376872378669863, "grad_norm": 3.796875, "learning_rate": 2.178515285094945e-05, "loss": 0.4871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9390, "tokens_per_second_per_gpu": 2283.89 }, { "epoch": 0.938685839824246, "grad_norm": 3.515625, "learning_rate": 2.1778583760798866e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9400, "tokens_per_second_per_gpu": 2457.73 }, { "epoch": 0.9396844417815059, "grad_norm": 2.421875, "learning_rate": 2.1772008958581473e-05, "loss": 0.5484, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9410, "tokens_per_second_per_gpu": 2243.42 }, { "epoch": 0.9406830437387658, "grad_norm": 3.75, "learning_rate": 2.176542844834485e-05, "loss": 0.4997, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9420, "tokens_per_second_per_gpu": 2249.54 }, { "epoch": 0.9416816456960255, "grad_norm": 2.390625, "learning_rate": 2.1758842234140067e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9430, "tokens_per_second_per_gpu": 2560.77 }, { "epoch": 0.9426802476532854, "grad_norm": 3.59375, "learning_rate": 2.1752250320021728e-05, "loss": 0.4795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9440, "tokens_per_second_per_gpu": 2440.34 }, { "epoch": 0.9436788496105453, "grad_norm": 3.25, "learning_rate": 2.1745652710047933e-05, "loss": 0.5529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9450, "tokens_per_second_per_gpu": 2426.9 }, { "epoch": 0.944677451567805, "grad_norm": 2.671875, "learning_rate": 2.1739049408280293e-05, "loss": 0.5781, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9460, "tokens_per_second_per_gpu": 2385.4 }, { "epoch": 0.9456760535250649, "grad_norm": 2.453125, "learning_rate": 2.173244041878392e-05, "loss": 0.5026, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9470, "tokens_per_second_per_gpu": 2223.65 }, { "epoch": 0.9466746554823248, "grad_norm": 2.78125, "learning_rate": 2.1725825745627433e-05, "loss": 0.5115, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9480, "tokens_per_second_per_gpu": 2264.93 }, { "epoch": 0.9476732574395845, "grad_norm": 2.984375, "learning_rate": 2.1719205392882945e-05, "loss": 0.5663, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9490, "tokens_per_second_per_gpu": 2282.33 }, { "epoch": 0.9486718593968444, "grad_norm": 2.921875, "learning_rate": 2.1712579364626066e-05, "loss": 0.5415, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9500, "tokens_per_second_per_gpu": 2309.61 }, { "epoch": 0.9496704613541043, "grad_norm": 3.46875, "learning_rate": 2.17059476649359e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9510, "tokens_per_second_per_gpu": 2457.7 }, { "epoch": 0.9506690633113641, "grad_norm": 3.109375, "learning_rate": 2.1699310297895045e-05, "loss": 0.5255, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9520, "tokens_per_second_per_gpu": 2374.72 }, { "epoch": 0.9516676652686239, "grad_norm": 2.734375, "learning_rate": 2.169266726758959e-05, "loss": 0.5186, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9530, "tokens_per_second_per_gpu": 2440.49 }, { "epoch": 0.9526662672258838, "grad_norm": 3.703125, "learning_rate": 2.1686018578109095e-05, "loss": 0.5266, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9540, "tokens_per_second_per_gpu": 2157.93 }, { "epoch": 0.9536648691831436, "grad_norm": 3.296875, "learning_rate": 2.1679364233546627e-05, "loss": 0.55, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9550, "tokens_per_second_per_gpu": 2556.39 }, { "epoch": 0.9546634711404034, "grad_norm": 3.34375, "learning_rate": 2.167270423799872e-05, "loss": 0.5439, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9560, "tokens_per_second_per_gpu": 2296.71 }, { "epoch": 0.9556620730976633, "grad_norm": 3.078125, "learning_rate": 2.166603859556539e-05, "loss": 0.5795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9570, "tokens_per_second_per_gpu": 2286.58 }, { "epoch": 0.9566606750549231, "grad_norm": 3.265625, "learning_rate": 2.1659367310350125e-05, "loss": 0.5357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9580, "tokens_per_second_per_gpu": 2493.23 }, { "epoch": 0.9576592770121829, "grad_norm": 2.65625, "learning_rate": 2.1652690386459896e-05, "loss": 0.4871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9590, "tokens_per_second_per_gpu": 2258.97 }, { "epoch": 0.9586578789694428, "grad_norm": 3.71875, "learning_rate": 2.1646007828005133e-05, "loss": 0.5311, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9600, "tokens_per_second_per_gpu": 2364.71 }, { "epoch": 0.9596564809267026, "grad_norm": 3.65625, "learning_rate": 2.1639319639099752e-05, "loss": 0.5891, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9610, "tokens_per_second_per_gpu": 2369.06 }, { "epoch": 0.9606550828839625, "grad_norm": 3.25, "learning_rate": 2.1632625823861116e-05, "loss": 0.5249, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9620, "tokens_per_second_per_gpu": 2427.93 }, { "epoch": 0.9616536848412223, "grad_norm": 3.609375, "learning_rate": 2.1625926386410068e-05, "loss": 0.515, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9630, "tokens_per_second_per_gpu": 2442.08 }, { "epoch": 0.9626522867984821, "grad_norm": 3.40625, "learning_rate": 2.16192213308709e-05, "loss": 0.5134, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9640, "tokens_per_second_per_gpu": 2447.24 }, { "epoch": 0.963650888755742, "grad_norm": 2.171875, "learning_rate": 2.1612510661371366e-05, "loss": 0.4723, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9650, "tokens_per_second_per_gpu": 2490.38 }, { "epoch": 0.9646494907130018, "grad_norm": 2.546875, "learning_rate": 2.1605794382042684e-05, "loss": 0.5397, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9660, "tokens_per_second_per_gpu": 2390.34 }, { "epoch": 0.9656480926702616, "grad_norm": 3.0625, "learning_rate": 2.1599072497019514e-05, "loss": 0.5669, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9670, "tokens_per_second_per_gpu": 2287.19 }, { "epoch": 0.9666466946275215, "grad_norm": 3.703125, "learning_rate": 2.1592345010439972e-05, "loss": 0.5543, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9680, "tokens_per_second_per_gpu": 2241.42 }, { "epoch": 0.9676452965847813, "grad_norm": 4.65625, "learning_rate": 2.1585611926445626e-05, "loss": 0.5612, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9690, "tokens_per_second_per_gpu": 2413.12 }, { "epoch": 0.9686438985420411, "grad_norm": 2.8125, "learning_rate": 2.1578873249181484e-05, "loss": 0.402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9700, "tokens_per_second_per_gpu": 2354.2 }, { "epoch": 0.969642500499301, "grad_norm": 2.53125, "learning_rate": 2.1572128982796e-05, "loss": 0.4903, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9710, "tokens_per_second_per_gpu": 2437.67 }, { "epoch": 0.9706411024565608, "grad_norm": 3.578125, "learning_rate": 2.1565379131441073e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9720, "tokens_per_second_per_gpu": 2311.3 }, { "epoch": 0.9716397044138206, "grad_norm": 2.53125, "learning_rate": 2.1558623699272024e-05, "loss": 0.4885, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9730, "tokens_per_second_per_gpu": 2120.68 }, { "epoch": 0.9726383063710805, "grad_norm": 3.125, "learning_rate": 2.155186269044763e-05, "loss": 0.5378, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9740, "tokens_per_second_per_gpu": 2343.54 }, { "epoch": 0.9736369083283404, "grad_norm": 3.09375, "learning_rate": 2.1545096109130092e-05, "loss": 0.461, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9750, "tokens_per_second_per_gpu": 2352.78 }, { "epoch": 0.9746355102856001, "grad_norm": 3.109375, "learning_rate": 2.153832395948504e-05, "loss": 0.5164, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9760, "tokens_per_second_per_gpu": 2444.0 }, { "epoch": 0.97563411224286, "grad_norm": 3.40625, "learning_rate": 2.1531546245681535e-05, "loss": 0.4671, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9770, "tokens_per_second_per_gpu": 2534.83 }, { "epoch": 0.9766327142001199, "grad_norm": 4.0625, "learning_rate": 2.1524762971892065e-05, "loss": 0.5014, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9780, "tokens_per_second_per_gpu": 2354.77 }, { "epoch": 0.9776313161573796, "grad_norm": 2.515625, "learning_rate": 2.1517974142292528e-05, "loss": 0.5193, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9790, "tokens_per_second_per_gpu": 2571.65 }, { "epoch": 0.9786299181146395, "grad_norm": 2.4375, "learning_rate": 2.1511179761062265e-05, "loss": 0.542, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9800, "tokens_per_second_per_gpu": 2262.65 }, { "epoch": 0.9796285200718994, "grad_norm": 2.953125, "learning_rate": 2.1504379832384013e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9810, "tokens_per_second_per_gpu": 2458.48 }, { "epoch": 0.9806271220291591, "grad_norm": 3.390625, "learning_rate": 2.149757436044394e-05, "loss": 0.5826, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9820, "tokens_per_second_per_gpu": 2483.31 }, { "epoch": 0.981625723986419, "grad_norm": 2.828125, "learning_rate": 2.1490763349431614e-05, "loss": 0.5197, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9830, "tokens_per_second_per_gpu": 2479.53 }, { "epoch": 0.9826243259436789, "grad_norm": 2.578125, "learning_rate": 2.148394680354002e-05, "loss": 0.4671, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9840, "tokens_per_second_per_gpu": 2380.68 }, { "epoch": 0.9836229279009386, "grad_norm": 3.96875, "learning_rate": 2.1477124726965553e-05, "loss": 0.553, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9850, "tokens_per_second_per_gpu": 2403.42 }, { "epoch": 0.9846215298581985, "grad_norm": 3.078125, "learning_rate": 2.1470297123908e-05, "loss": 0.5949, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9860, "tokens_per_second_per_gpu": 2272.79 }, { "epoch": 0.9856201318154584, "grad_norm": 3.25, "learning_rate": 2.1463463998570568e-05, "loss": 0.5343, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9870, "tokens_per_second_per_gpu": 2432.03 }, { "epoch": 0.9866187337727182, "grad_norm": 2.953125, "learning_rate": 2.145662535515985e-05, "loss": 0.5676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9880, "tokens_per_second_per_gpu": 2302.1 }, { "epoch": 0.987617335729978, "grad_norm": 2.796875, "learning_rate": 2.1449781197885843e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9890, "tokens_per_second_per_gpu": 2314.78 }, { "epoch": 0.9886159376872379, "grad_norm": 3.5625, "learning_rate": 2.1442931530961935e-05, "loss": 0.4797, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9900, "tokens_per_second_per_gpu": 2235.2 }, { "epoch": 0.9896145396444977, "grad_norm": 3.734375, "learning_rate": 2.1436076358604907e-05, "loss": 0.5592, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9910, "tokens_per_second_per_gpu": 2326.54 }, { "epoch": 0.9906131416017575, "grad_norm": 2.34375, "learning_rate": 2.1429215685034927e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9920, "tokens_per_second_per_gpu": 2404.87 }, { "epoch": 0.9916117435590174, "grad_norm": 3.234375, "learning_rate": 2.1422349514475558e-05, "loss": 0.526, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9930, "tokens_per_second_per_gpu": 2342.15 }, { "epoch": 0.9926103455162772, "grad_norm": 3.265625, "learning_rate": 2.1415477851153734e-05, "loss": 0.5268, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9940, "tokens_per_second_per_gpu": 2465.9 }, { "epoch": 0.993608947473537, "grad_norm": 3.546875, "learning_rate": 2.140860069929978e-05, "loss": 0.4687, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9950, "tokens_per_second_per_gpu": 2529.89 }, { "epoch": 0.9946075494307969, "grad_norm": 2.578125, "learning_rate": 2.1401718063147395e-05, "loss": 0.5186, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9960, "tokens_per_second_per_gpu": 2420.36 }, { "epoch": 0.9956061513880567, "grad_norm": 2.171875, "learning_rate": 2.1394829946933663e-05, "loss": 0.4581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9970, "tokens_per_second_per_gpu": 2447.72 }, { "epoch": 0.9966047533453165, "grad_norm": 3.25, "learning_rate": 2.138793635489903e-05, "loss": 0.5183, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9980, "tokens_per_second_per_gpu": 2350.04 }, { "epoch": 0.9976033553025764, "grad_norm": 2.390625, "learning_rate": 2.1381037291287318e-05, "loss": 0.493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 9990, "tokens_per_second_per_gpu": 2590.86 }, { "epoch": 0.9986019572598362, "grad_norm": 2.53125, "learning_rate": 2.1374132760345715e-05, "loss": 0.4775, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10000, "tokens_per_second_per_gpu": 2330.85 }, { "epoch": 0.9996005592170961, "grad_norm": 4.625, "learning_rate": 2.1367222766324786e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10010, "tokens_per_second_per_gpu": 2226.06 }, { "epoch": 1.0005991611743559, "grad_norm": 2.34375, "learning_rate": 2.136030731347844e-05, "loss": 0.4756, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10020, "tokens_per_second_per_gpu": 2485.65 }, { "epoch": 1.0015977631316157, "grad_norm": 2.9375, "learning_rate": 2.1353386406063962e-05, "loss": 0.3821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10030, "tokens_per_second_per_gpu": 2410.67 }, { "epoch": 1.0025963650888756, "grad_norm": 3.703125, "learning_rate": 2.1346460048341993e-05, "loss": 0.3844, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10040, "tokens_per_second_per_gpu": 2553.94 }, { "epoch": 1.0035949670461355, "grad_norm": 3.015625, "learning_rate": 2.1339528244576523e-05, "loss": 0.3741, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10050, "tokens_per_second_per_gpu": 2638.91 }, { "epoch": 1.0045935690033951, "grad_norm": 2.640625, "learning_rate": 2.1332590999034896e-05, "loss": 0.3115, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10060, "tokens_per_second_per_gpu": 2509.38 }, { "epoch": 1.005592170960655, "grad_norm": 3.140625, "learning_rate": 2.1325648315987805e-05, "loss": 0.3792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10070, "tokens_per_second_per_gpu": 2604.58 }, { "epoch": 1.0065907729179149, "grad_norm": 3.40625, "learning_rate": 2.13187001997093e-05, "loss": 0.4351, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10080, "tokens_per_second_per_gpu": 2477.32 }, { "epoch": 1.0075893748751747, "grad_norm": 2.8125, "learning_rate": 2.131174665447677e-05, "loss": 0.3877, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10090, "tokens_per_second_per_gpu": 2750.35 }, { "epoch": 1.0085879768324346, "grad_norm": 3.015625, "learning_rate": 2.1304787684570945e-05, "loss": 0.3788, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10100, "tokens_per_second_per_gpu": 2480.64 }, { "epoch": 1.0095865787896945, "grad_norm": 2.90625, "learning_rate": 2.1297823294275886e-05, "loss": 0.4102, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10110, "tokens_per_second_per_gpu": 2562.26 }, { "epoch": 1.0105851807469544, "grad_norm": 2.53125, "learning_rate": 2.1290853487879015e-05, "loss": 0.4066, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10120, "tokens_per_second_per_gpu": 2552.74 }, { "epoch": 1.011583782704214, "grad_norm": 3.640625, "learning_rate": 2.1283878269671056e-05, "loss": 0.4273, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10130, "tokens_per_second_per_gpu": 2578.49 }, { "epoch": 1.0125823846614739, "grad_norm": 3.9375, "learning_rate": 2.1276897643946093e-05, "loss": 0.382, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10140, "tokens_per_second_per_gpu": 2394.58 }, { "epoch": 1.0135809866187337, "grad_norm": 3.125, "learning_rate": 2.1269911615001524e-05, "loss": 0.372, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10150, "tokens_per_second_per_gpu": 2569.02 }, { "epoch": 1.0145795885759936, "grad_norm": 3.078125, "learning_rate": 2.1262920187138073e-05, "loss": 0.3501, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10160, "tokens_per_second_per_gpu": 2518.36 }, { "epoch": 1.0155781905332535, "grad_norm": 3.046875, "learning_rate": 2.1255923364659804e-05, "loss": 0.3756, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10170, "tokens_per_second_per_gpu": 2605.58 }, { "epoch": 1.0165767924905134, "grad_norm": 3.34375, "learning_rate": 2.1248921151874075e-05, "loss": 0.4022, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10180, "tokens_per_second_per_gpu": 2403.47 }, { "epoch": 1.017575394447773, "grad_norm": 2.578125, "learning_rate": 2.1241913553091582e-05, "loss": 0.3811, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10190, "tokens_per_second_per_gpu": 2495.42 }, { "epoch": 1.0185739964050329, "grad_norm": 3.359375, "learning_rate": 2.1234900572626337e-05, "loss": 0.3255, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10200, "tokens_per_second_per_gpu": 2452.39 }, { "epoch": 1.0195725983622927, "grad_norm": 3.0, "learning_rate": 2.1227882214795654e-05, "loss": 0.3567, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10210, "tokens_per_second_per_gpu": 2444.51 }, { "epoch": 1.0205712003195526, "grad_norm": 3.203125, "learning_rate": 2.1220858483920166e-05, "loss": 0.3636, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10220, "tokens_per_second_per_gpu": 2307.21 }, { "epoch": 1.0215698022768125, "grad_norm": 3.421875, "learning_rate": 2.1213829384323808e-05, "loss": 0.3647, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10230, "tokens_per_second_per_gpu": 2589.07 }, { "epoch": 1.0225684042340724, "grad_norm": 3.59375, "learning_rate": 2.120679492033383e-05, "loss": 0.4198, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10240, "tokens_per_second_per_gpu": 2441.03 }, { "epoch": 1.0235670061913322, "grad_norm": 2.5625, "learning_rate": 2.1199755096280765e-05, "loss": 0.3701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10250, "tokens_per_second_per_gpu": 2374.73 }, { "epoch": 1.0245656081485919, "grad_norm": 3.65625, "learning_rate": 2.119270991649847e-05, "loss": 0.3654, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10260, "tokens_per_second_per_gpu": 2548.94 }, { "epoch": 1.0255642101058517, "grad_norm": 3.3125, "learning_rate": 2.118565938532408e-05, "loss": 0.3376, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10270, "tokens_per_second_per_gpu": 2459.32 }, { "epoch": 1.0265628120631116, "grad_norm": 3.078125, "learning_rate": 2.117860350709803e-05, "loss": 0.391, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10280, "tokens_per_second_per_gpu": 2472.67 }, { "epoch": 1.0275614140203715, "grad_norm": 2.96875, "learning_rate": 2.1171542286164064e-05, "loss": 0.3786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10290, "tokens_per_second_per_gpu": 2562.9 }, { "epoch": 1.0285600159776314, "grad_norm": 2.84375, "learning_rate": 2.1164475726869183e-05, "loss": 0.3793, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10300, "tokens_per_second_per_gpu": 2413.79 }, { "epoch": 1.0295586179348912, "grad_norm": 2.703125, "learning_rate": 2.1157403833563694e-05, "loss": 0.3668, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10310, "tokens_per_second_per_gpu": 2632.13 }, { "epoch": 1.030557219892151, "grad_norm": 2.8125, "learning_rate": 2.1150326610601187e-05, "loss": 0.3414, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10320, "tokens_per_second_per_gpu": 2640.21 }, { "epoch": 1.0315558218494107, "grad_norm": 3.1875, "learning_rate": 2.114324406233853e-05, "loss": 0.3792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10330, "tokens_per_second_per_gpu": 2489.73 }, { "epoch": 1.0325544238066706, "grad_norm": 3.234375, "learning_rate": 2.113615619313587e-05, "loss": 0.3765, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10340, "tokens_per_second_per_gpu": 2450.85 }, { "epoch": 1.0335530257639305, "grad_norm": 2.9375, "learning_rate": 2.112906300735663e-05, "loss": 0.3234, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10350, "tokens_per_second_per_gpu": 2453.46 }, { "epoch": 1.0345516277211904, "grad_norm": 2.796875, "learning_rate": 2.112196450936751e-05, "loss": 0.3445, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10360, "tokens_per_second_per_gpu": 2693.76 }, { "epoch": 1.0355502296784502, "grad_norm": 3.21875, "learning_rate": 2.1114860703538465e-05, "loss": 0.3966, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10370, "tokens_per_second_per_gpu": 2466.44 }, { "epoch": 1.03654883163571, "grad_norm": 2.890625, "learning_rate": 2.1107751594242738e-05, "loss": 0.3355, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10380, "tokens_per_second_per_gpu": 2462.78 }, { "epoch": 1.0375474335929697, "grad_norm": 4.15625, "learning_rate": 2.1100637185856827e-05, "loss": 0.3952, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10390, "tokens_per_second_per_gpu": 2437.3 }, { "epoch": 1.0385460355502296, "grad_norm": 3.3125, "learning_rate": 2.1093517482760483e-05, "loss": 0.4243, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10400, "tokens_per_second_per_gpu": 2404.75 }, { "epoch": 1.0395446375074895, "grad_norm": 2.875, "learning_rate": 2.1086392489336738e-05, "loss": 0.3759, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10410, "tokens_per_second_per_gpu": 2410.51 }, { "epoch": 1.0405432394647494, "grad_norm": 2.875, "learning_rate": 2.107926220997186e-05, "loss": 0.4327, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10420, "tokens_per_second_per_gpu": 2532.5 }, { "epoch": 1.0415418414220092, "grad_norm": 3.34375, "learning_rate": 2.1072126649055386e-05, "loss": 0.3774, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10430, "tokens_per_second_per_gpu": 2431.3 }, { "epoch": 1.042540443379269, "grad_norm": 3.578125, "learning_rate": 2.10649858109801e-05, "loss": 0.4126, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10440, "tokens_per_second_per_gpu": 2400.68 }, { "epoch": 1.0435390453365287, "grad_norm": 3.203125, "learning_rate": 2.1057839700142025e-05, "loss": 0.3563, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10450, "tokens_per_second_per_gpu": 2546.33 }, { "epoch": 1.0445376472937886, "grad_norm": 3.203125, "learning_rate": 2.1050688320940447e-05, "loss": 0.3703, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10460, "tokens_per_second_per_gpu": 2335.57 }, { "epoch": 1.0455362492510485, "grad_norm": 3.484375, "learning_rate": 2.104353167777788e-05, "loss": 0.3584, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10470, "tokens_per_second_per_gpu": 2520.62 }, { "epoch": 1.0465348512083084, "grad_norm": 3.4375, "learning_rate": 2.103636977506009e-05, "loss": 0.3826, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10480, "tokens_per_second_per_gpu": 2446.27 }, { "epoch": 1.0475334531655682, "grad_norm": 3.09375, "learning_rate": 2.1029202617196074e-05, "loss": 0.3371, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10490, "tokens_per_second_per_gpu": 2393.9 }, { "epoch": 1.048532055122828, "grad_norm": 3.03125, "learning_rate": 2.102203020859806e-05, "loss": 0.3923, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10500, "tokens_per_second_per_gpu": 2445.86 }, { "epoch": 1.049530657080088, "grad_norm": 2.546875, "learning_rate": 2.1014852553681527e-05, "loss": 0.3713, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10510, "tokens_per_second_per_gpu": 2533.71 }, { "epoch": 1.0505292590373476, "grad_norm": 2.890625, "learning_rate": 2.1007669656865164e-05, "loss": 0.3394, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10520, "tokens_per_second_per_gpu": 2500.74 }, { "epoch": 1.0515278609946075, "grad_norm": 3.09375, "learning_rate": 2.1000481522570896e-05, "loss": 0.3918, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10530, "tokens_per_second_per_gpu": 2563.53 }, { "epoch": 1.0525264629518674, "grad_norm": 2.90625, "learning_rate": 2.0993288155223868e-05, "loss": 0.3635, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10540, "tokens_per_second_per_gpu": 2532.3 }, { "epoch": 1.0535250649091272, "grad_norm": 4.78125, "learning_rate": 2.0986089559252452e-05, "loss": 0.3708, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10550, "tokens_per_second_per_gpu": 2270.27 }, { "epoch": 1.054523666866387, "grad_norm": 2.875, "learning_rate": 2.097888573908824e-05, "loss": 0.3612, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10560, "tokens_per_second_per_gpu": 2444.8 }, { "epoch": 1.055522268823647, "grad_norm": 3.140625, "learning_rate": 2.0971676699166025e-05, "loss": 0.4002, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10570, "tokens_per_second_per_gpu": 2692.28 }, { "epoch": 1.0565208707809068, "grad_norm": 3.421875, "learning_rate": 2.0964462443923837e-05, "loss": 0.3633, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10580, "tokens_per_second_per_gpu": 2502.53 }, { "epoch": 1.0575194727381665, "grad_norm": 3.796875, "learning_rate": 2.0957242977802893e-05, "loss": 0.3816, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10590, "tokens_per_second_per_gpu": 2478.87 }, { "epoch": 1.0585180746954264, "grad_norm": 2.90625, "learning_rate": 2.095001830524764e-05, "loss": 0.4351, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10600, "tokens_per_second_per_gpu": 2647.35 }, { "epoch": 1.0595166766526862, "grad_norm": 3.328125, "learning_rate": 2.0942788430705706e-05, "loss": 0.3763, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10610, "tokens_per_second_per_gpu": 2538.81 }, { "epoch": 1.060515278609946, "grad_norm": 3.515625, "learning_rate": 2.0935553358627947e-05, "loss": 0.3716, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10620, "tokens_per_second_per_gpu": 2579.16 }, { "epoch": 1.061513880567206, "grad_norm": 4.21875, "learning_rate": 2.0928313093468405e-05, "loss": 0.4004, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10630, "tokens_per_second_per_gpu": 2422.37 }, { "epoch": 1.0625124825244658, "grad_norm": 2.796875, "learning_rate": 2.0921067639684315e-05, "loss": 0.374, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10640, "tokens_per_second_per_gpu": 2520.12 }, { "epoch": 1.0635110844817255, "grad_norm": 3.59375, "learning_rate": 2.0913817001736112e-05, "loss": 0.3821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10650, "tokens_per_second_per_gpu": 2452.83 }, { "epoch": 1.0645096864389854, "grad_norm": 2.90625, "learning_rate": 2.090656118408743e-05, "loss": 0.4051, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10660, "tokens_per_second_per_gpu": 2451.73 }, { "epoch": 1.0655082883962452, "grad_norm": 3.484375, "learning_rate": 2.089930019120507e-05, "loss": 0.3469, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10670, "tokens_per_second_per_gpu": 2280.71 }, { "epoch": 1.066506890353505, "grad_norm": 3.875, "learning_rate": 2.0892034027559043e-05, "loss": 0.3593, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10680, "tokens_per_second_per_gpu": 2486.52 }, { "epoch": 1.067505492310765, "grad_norm": 3.0625, "learning_rate": 2.0884762697622535e-05, "loss": 0.3743, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10690, "tokens_per_second_per_gpu": 2422.05 }, { "epoch": 1.0685040942680248, "grad_norm": 2.890625, "learning_rate": 2.08774862058719e-05, "loss": 0.3924, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10700, "tokens_per_second_per_gpu": 2553.78 }, { "epoch": 1.0695026962252845, "grad_norm": 2.890625, "learning_rate": 2.087020455678669e-05, "loss": 0.3186, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10710, "tokens_per_second_per_gpu": 2382.4 }, { "epoch": 1.0705012981825444, "grad_norm": 3.875, "learning_rate": 2.0862917754849616e-05, "loss": 0.417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10720, "tokens_per_second_per_gpu": 2639.88 }, { "epoch": 1.0714999001398042, "grad_norm": 3.265625, "learning_rate": 2.085562580454657e-05, "loss": 0.4074, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10730, "tokens_per_second_per_gpu": 2552.91 }, { "epoch": 1.072498502097064, "grad_norm": 3.40625, "learning_rate": 2.084832871036661e-05, "loss": 0.39, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10740, "tokens_per_second_per_gpu": 2434.46 }, { "epoch": 1.073497104054324, "grad_norm": 2.859375, "learning_rate": 2.0841026476801965e-05, "loss": 0.4252, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10750, "tokens_per_second_per_gpu": 2686.3 }, { "epoch": 1.0744957060115838, "grad_norm": 2.71875, "learning_rate": 2.0833719108348015e-05, "loss": 0.3544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10760, "tokens_per_second_per_gpu": 2581.04 }, { "epoch": 1.0754943079688437, "grad_norm": 2.640625, "learning_rate": 2.082640660950332e-05, "loss": 0.3754, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10770, "tokens_per_second_per_gpu": 2474.54 }, { "epoch": 1.0764929099261034, "grad_norm": 2.625, "learning_rate": 2.0819088984769587e-05, "loss": 0.39, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10780, "tokens_per_second_per_gpu": 2333.38 }, { "epoch": 1.0774915118833632, "grad_norm": 3.421875, "learning_rate": 2.0811766238651675e-05, "loss": 0.4041, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10790, "tokens_per_second_per_gpu": 2640.59 }, { "epoch": 1.078490113840623, "grad_norm": 2.984375, "learning_rate": 2.0804438375657602e-05, "loss": 0.3759, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10800, "tokens_per_second_per_gpu": 2614.4 }, { "epoch": 1.079488715797883, "grad_norm": 2.546875, "learning_rate": 2.0797105400298543e-05, "loss": 0.3279, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10810, "tokens_per_second_per_gpu": 2363.96 }, { "epoch": 1.0804873177551428, "grad_norm": 2.75, "learning_rate": 2.078976731708881e-05, "loss": 0.3981, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10820, "tokens_per_second_per_gpu": 2583.42 }, { "epoch": 1.0814859197124027, "grad_norm": 3.265625, "learning_rate": 2.078242413054585e-05, "loss": 0.4142, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10830, "tokens_per_second_per_gpu": 2524.84 }, { "epoch": 1.0824845216696626, "grad_norm": 2.828125, "learning_rate": 2.077507584519028e-05, "loss": 0.4036, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10840, "tokens_per_second_per_gpu": 2518.34 }, { "epoch": 1.0834831236269222, "grad_norm": 3.265625, "learning_rate": 2.076772246554583e-05, "loss": 0.3966, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10850, "tokens_per_second_per_gpu": 2368.6 }, { "epoch": 1.084481725584182, "grad_norm": 2.3125, "learning_rate": 2.076036399613938e-05, "loss": 0.3766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10860, "tokens_per_second_per_gpu": 2343.13 }, { "epoch": 1.085480327541442, "grad_norm": 3.515625, "learning_rate": 2.0753000441500937e-05, "loss": 0.3495, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10870, "tokens_per_second_per_gpu": 2299.61 }, { "epoch": 1.0864789294987018, "grad_norm": 2.5, "learning_rate": 2.074563180616364e-05, "loss": 0.3467, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10880, "tokens_per_second_per_gpu": 2569.65 }, { "epoch": 1.0874775314559617, "grad_norm": 2.78125, "learning_rate": 2.0738258094663758e-05, "loss": 0.3626, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10890, "tokens_per_second_per_gpu": 2571.69 }, { "epoch": 1.0884761334132216, "grad_norm": 2.828125, "learning_rate": 2.0730879311540684e-05, "loss": 0.3679, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10900, "tokens_per_second_per_gpu": 2519.04 }, { "epoch": 1.0894747353704812, "grad_norm": 4.28125, "learning_rate": 2.0723495461336927e-05, "loss": 0.442, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10910, "tokens_per_second_per_gpu": 2490.02 }, { "epoch": 1.090473337327741, "grad_norm": 3.109375, "learning_rate": 2.0716106548598133e-05, "loss": 0.4164, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10920, "tokens_per_second_per_gpu": 2561.98 }, { "epoch": 1.091471939285001, "grad_norm": 4.25, "learning_rate": 2.070871257787304e-05, "loss": 0.3644, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10930, "tokens_per_second_per_gpu": 2578.14 }, { "epoch": 1.0924705412422608, "grad_norm": 3.21875, "learning_rate": 2.070131355371352e-05, "loss": 0.4305, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10940, "tokens_per_second_per_gpu": 2700.82 }, { "epoch": 1.0934691431995207, "grad_norm": 2.65625, "learning_rate": 2.0693909480674545e-05, "loss": 0.4072, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10950, "tokens_per_second_per_gpu": 2526.59 }, { "epoch": 1.0944677451567806, "grad_norm": 3.109375, "learning_rate": 2.0686500363314196e-05, "loss": 0.3517, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10960, "tokens_per_second_per_gpu": 2573.06 }, { "epoch": 1.0954663471140402, "grad_norm": 2.65625, "learning_rate": 2.0679086206193666e-05, "loss": 0.3661, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10970, "tokens_per_second_per_gpu": 2715.48 }, { "epoch": 1.0964649490713, "grad_norm": 3.640625, "learning_rate": 2.067166701387725e-05, "loss": 0.3936, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10980, "tokens_per_second_per_gpu": 2321.46 }, { "epoch": 1.09746355102856, "grad_norm": 3.46875, "learning_rate": 2.066424279093233e-05, "loss": 0.3627, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 10990, "tokens_per_second_per_gpu": 2623.6 }, { "epoch": 1.0984621529858198, "grad_norm": 3.515625, "learning_rate": 2.0656813541929404e-05, "loss": 0.4099, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11000, "tokens_per_second_per_gpu": 2526.87 }, { "epoch": 1.0994607549430797, "grad_norm": 2.875, "learning_rate": 2.064937927144205e-05, "loss": 0.3461, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11010, "tokens_per_second_per_gpu": 2485.35 }, { "epoch": 1.1004593569003396, "grad_norm": 3.640625, "learning_rate": 2.0641939984046933e-05, "loss": 0.3731, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11020, "tokens_per_second_per_gpu": 2518.93 }, { "epoch": 1.1014579588575995, "grad_norm": 3.234375, "learning_rate": 2.063449568432383e-05, "loss": 0.3653, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11030, "tokens_per_second_per_gpu": 2381.33 }, { "epoch": 1.102456560814859, "grad_norm": 2.953125, "learning_rate": 2.062704637685558e-05, "loss": 0.3393, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11040, "tokens_per_second_per_gpu": 2523.77 }, { "epoch": 1.103455162772119, "grad_norm": 4.21875, "learning_rate": 2.0619592066228117e-05, "loss": 0.4006, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11050, "tokens_per_second_per_gpu": 2464.24 }, { "epoch": 1.1044537647293788, "grad_norm": 4.0, "learning_rate": 2.061213275703045e-05, "loss": 0.3797, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11060, "tokens_per_second_per_gpu": 2435.07 }, { "epoch": 1.1054523666866387, "grad_norm": 3.4375, "learning_rate": 2.0604668453854668e-05, "loss": 0.3935, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11070, "tokens_per_second_per_gpu": 2459.11 }, { "epoch": 1.1064509686438986, "grad_norm": 3.171875, "learning_rate": 2.059719916129593e-05, "loss": 0.3557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11080, "tokens_per_second_per_gpu": 2593.08 }, { "epoch": 1.1074495706011585, "grad_norm": 3.40625, "learning_rate": 2.0589724883952473e-05, "loss": 0.3631, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11090, "tokens_per_second_per_gpu": 2583.35 }, { "epoch": 1.1084481725584183, "grad_norm": 2.640625, "learning_rate": 2.05822456264256e-05, "loss": 0.3801, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11100, "tokens_per_second_per_gpu": 2311.85 }, { "epoch": 1.109446774515678, "grad_norm": 3.234375, "learning_rate": 2.057476139331968e-05, "loss": 0.4028, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11110, "tokens_per_second_per_gpu": 2648.11 }, { "epoch": 1.1104453764729378, "grad_norm": 3.015625, "learning_rate": 2.056727218924214e-05, "loss": 0.3596, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11120, "tokens_per_second_per_gpu": 2455.12 }, { "epoch": 1.1114439784301977, "grad_norm": 3.40625, "learning_rate": 2.055977801880348e-05, "loss": 0.3667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11130, "tokens_per_second_per_gpu": 2489.96 }, { "epoch": 1.1124425803874576, "grad_norm": 3.171875, "learning_rate": 2.055227888661724e-05, "loss": 0.4134, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11140, "tokens_per_second_per_gpu": 2642.2 }, { "epoch": 1.1134411823447175, "grad_norm": 2.59375, "learning_rate": 2.054477479730003e-05, "loss": 0.4068, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11150, "tokens_per_second_per_gpu": 2507.57 }, { "epoch": 1.1144397843019773, "grad_norm": 3.375, "learning_rate": 2.0537265755471503e-05, "loss": 0.3853, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11160, "tokens_per_second_per_gpu": 2595.65 }, { "epoch": 1.115438386259237, "grad_norm": 3.640625, "learning_rate": 2.052975176575436e-05, "loss": 0.3565, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11170, "tokens_per_second_per_gpu": 2450.02 }, { "epoch": 1.1164369882164968, "grad_norm": 2.59375, "learning_rate": 2.0522232832774356e-05, "loss": 0.3698, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11180, "tokens_per_second_per_gpu": 2692.16 }, { "epoch": 1.1174355901737567, "grad_norm": 3.5625, "learning_rate": 2.0514708961160287e-05, "loss": 0.3641, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11190, "tokens_per_second_per_gpu": 2489.54 }, { "epoch": 1.1184341921310166, "grad_norm": 3.59375, "learning_rate": 2.0507180155543978e-05, "loss": 0.4351, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11200, "tokens_per_second_per_gpu": 2628.25 }, { "epoch": 1.1194327940882765, "grad_norm": 3.40625, "learning_rate": 2.049964642056031e-05, "loss": 0.3791, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11210, "tokens_per_second_per_gpu": 2502.54 }, { "epoch": 1.1204313960455363, "grad_norm": 3.96875, "learning_rate": 2.0492107760847178e-05, "loss": 0.445, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11220, "tokens_per_second_per_gpu": 2732.07 }, { "epoch": 1.121429998002796, "grad_norm": 3.796875, "learning_rate": 2.048456418104553e-05, "loss": 0.4343, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11230, "tokens_per_second_per_gpu": 2291.46 }, { "epoch": 1.1224285999600558, "grad_norm": 3.3125, "learning_rate": 2.0477015685799326e-05, "loss": 0.3878, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11240, "tokens_per_second_per_gpu": 2417.01 }, { "epoch": 1.1234272019173157, "grad_norm": 3.25, "learning_rate": 2.046946227975556e-05, "loss": 0.3776, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11250, "tokens_per_second_per_gpu": 2450.72 }, { "epoch": 1.1244258038745756, "grad_norm": 3.296875, "learning_rate": 2.0461903967564248e-05, "loss": 0.3887, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11260, "tokens_per_second_per_gpu": 2502.69 }, { "epoch": 1.1254244058318355, "grad_norm": 3.171875, "learning_rate": 2.0454340753878426e-05, "loss": 0.4002, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11270, "tokens_per_second_per_gpu": 2606.02 }, { "epoch": 1.1264230077890953, "grad_norm": 3.40625, "learning_rate": 2.0446772643354147e-05, "loss": 0.3599, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11280, "tokens_per_second_per_gpu": 2617.9 }, { "epoch": 1.1274216097463552, "grad_norm": 3.546875, "learning_rate": 2.043919964065048e-05, "loss": 0.3883, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11290, "tokens_per_second_per_gpu": 2598.42 }, { "epoch": 1.1284202117036148, "grad_norm": 2.703125, "learning_rate": 2.0431621750429504e-05, "loss": 0.327, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11300, "tokens_per_second_per_gpu": 2564.09 }, { "epoch": 1.1294188136608747, "grad_norm": 3.34375, "learning_rate": 2.042403897735631e-05, "loss": 0.4119, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11310, "tokens_per_second_per_gpu": 2580.92 }, { "epoch": 1.1304174156181346, "grad_norm": 3.390625, "learning_rate": 2.041645132609899e-05, "loss": 0.368, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11320, "tokens_per_second_per_gpu": 2489.46 }, { "epoch": 1.1314160175753945, "grad_norm": 3.046875, "learning_rate": 2.040885880132864e-05, "loss": 0.4276, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11330, "tokens_per_second_per_gpu": 2517.04 }, { "epoch": 1.1324146195326543, "grad_norm": 2.359375, "learning_rate": 2.0401261407719357e-05, "loss": 0.3311, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11340, "tokens_per_second_per_gpu": 2478.82 }, { "epoch": 1.1334132214899142, "grad_norm": 2.9375, "learning_rate": 2.0393659149948246e-05, "loss": 0.3414, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11350, "tokens_per_second_per_gpu": 2591.95 }, { "epoch": 1.134411823447174, "grad_norm": 2.609375, "learning_rate": 2.0386052032695384e-05, "loss": 0.3711, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11360, "tokens_per_second_per_gpu": 2757.89 }, { "epoch": 1.1354104254044337, "grad_norm": 3.84375, "learning_rate": 2.037844006064386e-05, "loss": 0.409, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11370, "tokens_per_second_per_gpu": 2593.42 }, { "epoch": 1.1364090273616936, "grad_norm": 2.8125, "learning_rate": 2.0370823238479746e-05, "loss": 0.3531, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11380, "tokens_per_second_per_gpu": 2502.44 }, { "epoch": 1.1374076293189535, "grad_norm": 3.5, "learning_rate": 2.036320157089209e-05, "loss": 0.3806, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11390, "tokens_per_second_per_gpu": 2537.08 }, { "epoch": 1.1384062312762133, "grad_norm": 3.046875, "learning_rate": 2.0355575062572933e-05, "loss": 0.3631, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11400, "tokens_per_second_per_gpu": 2624.34 }, { "epoch": 1.1394048332334732, "grad_norm": 3.078125, "learning_rate": 2.0347943718217298e-05, "loss": 0.3321, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11410, "tokens_per_second_per_gpu": 2527.84 }, { "epoch": 1.140403435190733, "grad_norm": 4.0, "learning_rate": 2.0340307542523178e-05, "loss": 0.4095, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11420, "tokens_per_second_per_gpu": 2637.34 }, { "epoch": 1.141402037147993, "grad_norm": 3.265625, "learning_rate": 2.033266654019155e-05, "loss": 0.4096, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11430, "tokens_per_second_per_gpu": 2408.25 }, { "epoch": 1.1424006391052526, "grad_norm": 2.53125, "learning_rate": 2.032502071592634e-05, "loss": 0.4145, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11440, "tokens_per_second_per_gpu": 2423.87 }, { "epoch": 1.1433992410625125, "grad_norm": 3.90625, "learning_rate": 2.0317370074434468e-05, "loss": 0.3651, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11450, "tokens_per_second_per_gpu": 2276.69 }, { "epoch": 1.1443978430197723, "grad_norm": 3.59375, "learning_rate": 2.0309714620425814e-05, "loss": 0.3676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11460, "tokens_per_second_per_gpu": 2499.47 }, { "epoch": 1.1453964449770322, "grad_norm": 3.59375, "learning_rate": 2.0302054358613205e-05, "loss": 0.3706, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11470, "tokens_per_second_per_gpu": 2586.36 }, { "epoch": 1.146395046934292, "grad_norm": 2.5, "learning_rate": 2.029438929371245e-05, "loss": 0.359, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11480, "tokens_per_second_per_gpu": 2321.0 }, { "epoch": 1.1473936488915517, "grad_norm": 3.453125, "learning_rate": 2.0286719430442295e-05, "loss": 0.3901, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11490, "tokens_per_second_per_gpu": 2596.12 }, { "epoch": 1.1483922508488116, "grad_norm": 3.375, "learning_rate": 2.0279044773524454e-05, "loss": 0.4074, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11500, "tokens_per_second_per_gpu": 2475.82 }, { "epoch": 1.1493908528060715, "grad_norm": 3.328125, "learning_rate": 2.0271365327683587e-05, "loss": 0.364, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11510, "tokens_per_second_per_gpu": 2545.92 }, { "epoch": 1.1503894547633313, "grad_norm": 2.765625, "learning_rate": 2.02636810976473e-05, "loss": 0.3982, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11520, "tokens_per_second_per_gpu": 2471.37 }, { "epoch": 1.1513880567205912, "grad_norm": 2.625, "learning_rate": 2.0255992088146147e-05, "loss": 0.3636, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11530, "tokens_per_second_per_gpu": 2529.12 }, { "epoch": 1.152386658677851, "grad_norm": 3.09375, "learning_rate": 2.0248298303913628e-05, "loss": 0.4309, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11540, "tokens_per_second_per_gpu": 2677.87 }, { "epoch": 1.153385260635111, "grad_norm": 2.84375, "learning_rate": 2.024059974968617e-05, "loss": 0.3773, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11550, "tokens_per_second_per_gpu": 2579.77 }, { "epoch": 1.1543838625923706, "grad_norm": 2.734375, "learning_rate": 2.0232896430203148e-05, "loss": 0.337, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11560, "tokens_per_second_per_gpu": 2677.88 }, { "epoch": 1.1553824645496305, "grad_norm": 4.1875, "learning_rate": 2.022518835020687e-05, "loss": 0.3852, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11570, "tokens_per_second_per_gpu": 2341.58 }, { "epoch": 1.1563810665068903, "grad_norm": 3.140625, "learning_rate": 2.021747551444257e-05, "loss": 0.4005, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11580, "tokens_per_second_per_gpu": 2625.04 }, { "epoch": 1.1573796684641502, "grad_norm": 2.453125, "learning_rate": 2.020975792765841e-05, "loss": 0.3312, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11590, "tokens_per_second_per_gpu": 2598.25 }, { "epoch": 1.15837827042141, "grad_norm": 3.296875, "learning_rate": 2.0202035594605474e-05, "loss": 0.412, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11600, "tokens_per_second_per_gpu": 2500.75 }, { "epoch": 1.15937687237867, "grad_norm": 2.859375, "learning_rate": 2.0194308520037785e-05, "loss": 0.3422, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11610, "tokens_per_second_per_gpu": 2524.22 }, { "epoch": 1.1603754743359298, "grad_norm": 3.75, "learning_rate": 2.0186576708712262e-05, "loss": 0.447, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11620, "tokens_per_second_per_gpu": 2545.94 }, { "epoch": 1.1613740762931895, "grad_norm": 4.125, "learning_rate": 2.0178840165388753e-05, "loss": 0.3762, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11630, "tokens_per_second_per_gpu": 2560.5 }, { "epoch": 1.1623726782504493, "grad_norm": 2.75, "learning_rate": 2.0171098894830016e-05, "loss": 0.3786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11640, "tokens_per_second_per_gpu": 2608.54 }, { "epoch": 1.1633712802077092, "grad_norm": 3.140625, "learning_rate": 2.016335290180172e-05, "loss": 0.4217, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11650, "tokens_per_second_per_gpu": 2437.49 }, { "epoch": 1.164369882164969, "grad_norm": 3.25, "learning_rate": 2.015560219107244e-05, "loss": 0.3945, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11660, "tokens_per_second_per_gpu": 2586.77 }, { "epoch": 1.165368484122229, "grad_norm": 3.8125, "learning_rate": 2.0147846767413662e-05, "loss": 0.3557, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11670, "tokens_per_second_per_gpu": 2425.43 }, { "epoch": 1.1663670860794888, "grad_norm": 3.390625, "learning_rate": 2.014008663559976e-05, "loss": 0.409, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11680, "tokens_per_second_per_gpu": 2547.59 }, { "epoch": 1.1673656880367487, "grad_norm": 3.78125, "learning_rate": 2.013232180040801e-05, "loss": 0.4389, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11690, "tokens_per_second_per_gpu": 2536.66 }, { "epoch": 1.1683642899940083, "grad_norm": 2.953125, "learning_rate": 2.01245522666186e-05, "loss": 0.3664, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11700, "tokens_per_second_per_gpu": 2349.73 }, { "epoch": 1.1693628919512682, "grad_norm": 2.6875, "learning_rate": 2.0116778039014592e-05, "loss": 0.3896, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11710, "tokens_per_second_per_gpu": 2623.81 }, { "epoch": 1.170361493908528, "grad_norm": 3.0625, "learning_rate": 2.010899912238194e-05, "loss": 0.369, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11720, "tokens_per_second_per_gpu": 2437.79 }, { "epoch": 1.171360095865788, "grad_norm": 3.625, "learning_rate": 2.0101215521509493e-05, "loss": 0.4155, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11730, "tokens_per_second_per_gpu": 2578.04 }, { "epoch": 1.1723586978230478, "grad_norm": 3.296875, "learning_rate": 2.0093427241188973e-05, "loss": 0.3504, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11740, "tokens_per_second_per_gpu": 2560.81 }, { "epoch": 1.1733572997803075, "grad_norm": 2.890625, "learning_rate": 2.0085634286214994e-05, "loss": 0.3253, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11750, "tokens_per_second_per_gpu": 2434.59 }, { "epoch": 1.1743559017375673, "grad_norm": 2.953125, "learning_rate": 2.007783666138504e-05, "loss": 0.4153, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11760, "tokens_per_second_per_gpu": 2658.5 }, { "epoch": 1.1753545036948272, "grad_norm": 3.609375, "learning_rate": 2.0070034371499478e-05, "loss": 0.3487, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11770, "tokens_per_second_per_gpu": 2557.66 }, { "epoch": 1.176353105652087, "grad_norm": 4.40625, "learning_rate": 2.0062227421361533e-05, "loss": 0.4175, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11780, "tokens_per_second_per_gpu": 2538.28 }, { "epoch": 1.177351707609347, "grad_norm": 3.390625, "learning_rate": 2.005441581577731e-05, "loss": 0.3631, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11790, "tokens_per_second_per_gpu": 2436.27 }, { "epoch": 1.1783503095666068, "grad_norm": 3.3125, "learning_rate": 2.004659955955578e-05, "loss": 0.3427, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11800, "tokens_per_second_per_gpu": 2414.53 }, { "epoch": 1.1793489115238667, "grad_norm": 3.15625, "learning_rate": 2.0038778657508768e-05, "loss": 0.3972, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11810, "tokens_per_second_per_gpu": 2447.28 }, { "epoch": 1.1803475134811263, "grad_norm": 2.609375, "learning_rate": 2.0030953114450966e-05, "loss": 0.3781, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11820, "tokens_per_second_per_gpu": 2546.67 }, { "epoch": 1.1813461154383862, "grad_norm": 2.609375, "learning_rate": 2.002312293519993e-05, "loss": 0.3934, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11830, "tokens_per_second_per_gpu": 2465.55 }, { "epoch": 1.182344717395646, "grad_norm": 2.359375, "learning_rate": 2.0015288124576058e-05, "loss": 0.3514, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11840, "tokens_per_second_per_gpu": 2421.48 }, { "epoch": 1.183343319352906, "grad_norm": 3.453125, "learning_rate": 2.0007448687402597e-05, "loss": 0.3534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11850, "tokens_per_second_per_gpu": 2547.46 }, { "epoch": 1.1843419213101658, "grad_norm": 3.65625, "learning_rate": 1.999960462850566e-05, "loss": 0.3913, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11860, "tokens_per_second_per_gpu": 2443.77 }, { "epoch": 1.1853405232674257, "grad_norm": 3.328125, "learning_rate": 1.9991755952714186e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11870, "tokens_per_second_per_gpu": 2559.85 }, { "epoch": 1.1863391252246855, "grad_norm": 3.609375, "learning_rate": 1.9983902664859964e-05, "loss": 0.3603, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11880, "tokens_per_second_per_gpu": 2741.61 }, { "epoch": 1.1873377271819452, "grad_norm": 3.21875, "learning_rate": 1.997604476977763e-05, "loss": 0.3719, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11890, "tokens_per_second_per_gpu": 2636.32 }, { "epoch": 1.188336329139205, "grad_norm": 3.296875, "learning_rate": 1.996818227230464e-05, "loss": 0.3257, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11900, "tokens_per_second_per_gpu": 2349.74 }, { "epoch": 1.189334931096465, "grad_norm": 3.40625, "learning_rate": 1.9960315177281296e-05, "loss": 0.3542, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11910, "tokens_per_second_per_gpu": 2527.22 }, { "epoch": 1.1903335330537248, "grad_norm": 3.8125, "learning_rate": 1.9952443489550725e-05, "loss": 0.3727, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11920, "tokens_per_second_per_gpu": 2713.58 }, { "epoch": 1.1913321350109847, "grad_norm": 3.125, "learning_rate": 1.9944567213958888e-05, "loss": 0.394, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11930, "tokens_per_second_per_gpu": 2575.67 }, { "epoch": 1.1923307369682445, "grad_norm": 3.484375, "learning_rate": 1.993668635535456e-05, "loss": 0.4142, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11940, "tokens_per_second_per_gpu": 2577.41 }, { "epoch": 1.1933293389255044, "grad_norm": 3.28125, "learning_rate": 1.992880091858934e-05, "loss": 0.3472, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11950, "tokens_per_second_per_gpu": 2342.02 }, { "epoch": 1.194327940882764, "grad_norm": 2.96875, "learning_rate": 1.9920910908517654e-05, "loss": 0.3285, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11960, "tokens_per_second_per_gpu": 2452.88 }, { "epoch": 1.195326542840024, "grad_norm": 3.578125, "learning_rate": 1.9913016329996732e-05, "loss": 0.3936, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11970, "tokens_per_second_per_gpu": 2507.86 }, { "epoch": 1.1963251447972838, "grad_norm": 3.125, "learning_rate": 1.9905117187886622e-05, "loss": 0.3871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11980, "tokens_per_second_per_gpu": 2699.43 }, { "epoch": 1.1973237467545437, "grad_norm": 3.484375, "learning_rate": 1.989721348705019e-05, "loss": 0.4131, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 11990, "tokens_per_second_per_gpu": 2421.05 }, { "epoch": 1.1983223487118035, "grad_norm": 2.96875, "learning_rate": 1.988930523235309e-05, "loss": 0.384, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12000, "tokens_per_second_per_gpu": 2369.98 }, { "epoch": 1.1993209506690632, "grad_norm": 2.984375, "learning_rate": 1.98813924286638e-05, "loss": 0.3826, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12010, "tokens_per_second_per_gpu": 2611.23 }, { "epoch": 1.200319552626323, "grad_norm": 4.09375, "learning_rate": 1.9873475080853568e-05, "loss": 0.4208, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12020, "tokens_per_second_per_gpu": 2632.17 }, { "epoch": 1.201318154583583, "grad_norm": 3.34375, "learning_rate": 1.9865553193796473e-05, "loss": 0.3592, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12030, "tokens_per_second_per_gpu": 2476.77 }, { "epoch": 1.2023167565408428, "grad_norm": 4.34375, "learning_rate": 1.9857626772369377e-05, "loss": 0.4046, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12040, "tokens_per_second_per_gpu": 2362.01 }, { "epoch": 1.2033153584981027, "grad_norm": 3.265625, "learning_rate": 1.984969582145192e-05, "loss": 0.3601, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12050, "tokens_per_second_per_gpu": 2484.28 }, { "epoch": 1.2043139604553625, "grad_norm": 3.359375, "learning_rate": 1.9841760345926546e-05, "loss": 0.4243, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12060, "tokens_per_second_per_gpu": 2397.43 }, { "epoch": 1.2053125624126224, "grad_norm": 3.125, "learning_rate": 1.9833820350678485e-05, "loss": 0.3798, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12070, "tokens_per_second_per_gpu": 2499.29 }, { "epoch": 1.206311164369882, "grad_norm": 2.984375, "learning_rate": 1.982587584059574e-05, "loss": 0.3906, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12080, "tokens_per_second_per_gpu": 2369.19 }, { "epoch": 1.207309766327142, "grad_norm": 3.296875, "learning_rate": 1.9817926820569093e-05, "loss": 0.4203, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12090, "tokens_per_second_per_gpu": 2447.23 }, { "epoch": 1.2083083682844018, "grad_norm": 3.015625, "learning_rate": 1.9809973295492117e-05, "loss": 0.3454, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12100, "tokens_per_second_per_gpu": 2507.87 }, { "epoch": 1.2093069702416617, "grad_norm": 3.4375, "learning_rate": 1.9802015270261138e-05, "loss": 0.3983, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12110, "tokens_per_second_per_gpu": 2522.79 }, { "epoch": 1.2103055721989215, "grad_norm": 3.296875, "learning_rate": 1.9794052749775265e-05, "loss": 0.3946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12120, "tokens_per_second_per_gpu": 2385.53 }, { "epoch": 1.2113041741561814, "grad_norm": 3.921875, "learning_rate": 1.978608573893638e-05, "loss": 0.3605, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12130, "tokens_per_second_per_gpu": 2533.79 }, { "epoch": 1.2123027761134413, "grad_norm": 4.09375, "learning_rate": 1.977811424264911e-05, "loss": 0.3831, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12140, "tokens_per_second_per_gpu": 2388.3 }, { "epoch": 1.213301378070701, "grad_norm": 3.28125, "learning_rate": 1.9770138265820863e-05, "loss": 0.4178, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12150, "tokens_per_second_per_gpu": 2582.95 }, { "epoch": 1.2142999800279608, "grad_norm": 3.828125, "learning_rate": 1.97621578133618e-05, "loss": 0.3467, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12160, "tokens_per_second_per_gpu": 2583.34 }, { "epoch": 1.2152985819852207, "grad_norm": 3.328125, "learning_rate": 1.975417289018482e-05, "loss": 0.3537, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12170, "tokens_per_second_per_gpu": 2463.0 }, { "epoch": 1.2162971839424805, "grad_norm": 3.734375, "learning_rate": 1.9746183501205598e-05, "loss": 0.3984, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12180, "tokens_per_second_per_gpu": 2424.35 }, { "epoch": 1.2172957858997404, "grad_norm": 3.984375, "learning_rate": 1.9738189651342553e-05, "loss": 0.4024, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12190, "tokens_per_second_per_gpu": 2497.42 }, { "epoch": 1.2182943878570003, "grad_norm": 2.59375, "learning_rate": 1.9730191345516836e-05, "loss": 0.3702, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12200, "tokens_per_second_per_gpu": 2654.54 }, { "epoch": 1.2192929898142602, "grad_norm": 3.53125, "learning_rate": 1.9722188588652356e-05, "loss": 0.3358, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12210, "tokens_per_second_per_gpu": 2431.5 }, { "epoch": 1.2202915917715198, "grad_norm": 3.015625, "learning_rate": 1.971418138567576e-05, "loss": 0.3658, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12220, "tokens_per_second_per_gpu": 2449.86 }, { "epoch": 1.2212901937287797, "grad_norm": 2.265625, "learning_rate": 1.970616974151643e-05, "loss": 0.3403, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12230, "tokens_per_second_per_gpu": 2307.03 }, { "epoch": 1.2222887956860395, "grad_norm": 3.703125, "learning_rate": 1.9698153661106473e-05, "loss": 0.3816, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12240, "tokens_per_second_per_gpu": 2521.98 }, { "epoch": 1.2232873976432994, "grad_norm": 3.3125, "learning_rate": 1.9690133149380742e-05, "loss": 0.4098, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12250, "tokens_per_second_per_gpu": 2618.57 }, { "epoch": 1.2242859996005593, "grad_norm": 2.25, "learning_rate": 1.9682108211276808e-05, "loss": 0.3402, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12260, "tokens_per_second_per_gpu": 2363.35 }, { "epoch": 1.225284601557819, "grad_norm": 4.0, "learning_rate": 1.9674078851734978e-05, "loss": 0.3263, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12270, "tokens_per_second_per_gpu": 2531.37 }, { "epoch": 1.2262832035150788, "grad_norm": 3.6875, "learning_rate": 1.9666045075698265e-05, "loss": 0.3738, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12280, "tokens_per_second_per_gpu": 2461.55 }, { "epoch": 1.2272818054723387, "grad_norm": 3.03125, "learning_rate": 1.9658006888112412e-05, "loss": 0.3872, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12290, "tokens_per_second_per_gpu": 2490.07 }, { "epoch": 1.2282804074295985, "grad_norm": 3.78125, "learning_rate": 1.9649964293925875e-05, "loss": 0.3795, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12300, "tokens_per_second_per_gpu": 2548.95 }, { "epoch": 1.2292790093868584, "grad_norm": 2.59375, "learning_rate": 1.9641917298089826e-05, "loss": 0.3598, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12310, "tokens_per_second_per_gpu": 2615.07 }, { "epoch": 1.2302776113441183, "grad_norm": 2.71875, "learning_rate": 1.963386590555814e-05, "loss": 0.3856, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12320, "tokens_per_second_per_gpu": 2654.05 }, { "epoch": 1.2312762133013782, "grad_norm": 3.21875, "learning_rate": 1.962581012128741e-05, "loss": 0.4281, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12330, "tokens_per_second_per_gpu": 2429.29 }, { "epoch": 1.2322748152586378, "grad_norm": 2.90625, "learning_rate": 1.9617749950236906e-05, "loss": 0.3792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12340, "tokens_per_second_per_gpu": 2331.24 }, { "epoch": 1.2332734172158977, "grad_norm": 3.75, "learning_rate": 1.9609685397368636e-05, "loss": 0.3907, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12350, "tokens_per_second_per_gpu": 2449.37 }, { "epoch": 1.2342720191731575, "grad_norm": 4.3125, "learning_rate": 1.9601616467647282e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12360, "tokens_per_second_per_gpu": 2437.56 }, { "epoch": 1.2352706211304174, "grad_norm": 3.640625, "learning_rate": 1.9593543166040222e-05, "loss": 0.3335, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12370, "tokens_per_second_per_gpu": 2518.86 }, { "epoch": 1.2362692230876773, "grad_norm": 3.359375, "learning_rate": 1.9585465497517536e-05, "loss": 0.3533, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12380, "tokens_per_second_per_gpu": 2401.19 }, { "epoch": 1.2372678250449372, "grad_norm": 3.53125, "learning_rate": 1.957738346705198e-05, "loss": 0.3719, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12390, "tokens_per_second_per_gpu": 2389.02 }, { "epoch": 1.238266427002197, "grad_norm": 2.953125, "learning_rate": 1.9569297079618997e-05, "loss": 0.3994, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12400, "tokens_per_second_per_gpu": 2492.77 }, { "epoch": 1.2392650289594567, "grad_norm": 2.515625, "learning_rate": 1.956120634019672e-05, "loss": 0.4209, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12410, "tokens_per_second_per_gpu": 2474.85 }, { "epoch": 1.2402636309167165, "grad_norm": 3.34375, "learning_rate": 1.9553111253765963e-05, "loss": 0.3419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12420, "tokens_per_second_per_gpu": 2461.31 }, { "epoch": 1.2412622328739764, "grad_norm": 3.671875, "learning_rate": 1.95450118253102e-05, "loss": 0.3986, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12430, "tokens_per_second_per_gpu": 2366.39 }, { "epoch": 1.2422608348312363, "grad_norm": 2.828125, "learning_rate": 1.9536908059815594e-05, "loss": 0.3934, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12440, "tokens_per_second_per_gpu": 2580.87 }, { "epoch": 1.2432594367884962, "grad_norm": 3.171875, "learning_rate": 1.9528799962270972e-05, "loss": 0.393, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12450, "tokens_per_second_per_gpu": 2476.25 }, { "epoch": 1.244258038745756, "grad_norm": 3.34375, "learning_rate": 1.952068753766782e-05, "loss": 0.3963, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12460, "tokens_per_second_per_gpu": 2296.11 }, { "epoch": 1.245256640703016, "grad_norm": 2.953125, "learning_rate": 1.9512570791000306e-05, "loss": 0.3903, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12470, "tokens_per_second_per_gpu": 2375.2 }, { "epoch": 1.2462552426602755, "grad_norm": 4.375, "learning_rate": 1.9504449727265246e-05, "loss": 0.3919, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12480, "tokens_per_second_per_gpu": 2347.75 }, { "epoch": 1.2472538446175354, "grad_norm": 3.515625, "learning_rate": 1.9496324351462116e-05, "loss": 0.3282, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12490, "tokens_per_second_per_gpu": 2451.43 }, { "epoch": 1.2482524465747953, "grad_norm": 3.390625, "learning_rate": 1.9488194668593044e-05, "loss": 0.3572, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12500, "tokens_per_second_per_gpu": 2503.64 }, { "epoch": 1.2492510485320552, "grad_norm": 3.28125, "learning_rate": 1.9480060683662813e-05, "loss": 0.3386, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12510, "tokens_per_second_per_gpu": 2346.35 }, { "epoch": 1.250249650489315, "grad_norm": 4.09375, "learning_rate": 1.9471922401678856e-05, "loss": 0.3793, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12520, "tokens_per_second_per_gpu": 2339.51 }, { "epoch": 1.2512482524465747, "grad_norm": 3.59375, "learning_rate": 1.9463779827651245e-05, "loss": 0.4025, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12530, "tokens_per_second_per_gpu": 2297.27 }, { "epoch": 1.2522468544038348, "grad_norm": 2.953125, "learning_rate": 1.9455632966592703e-05, "loss": 0.3928, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12540, "tokens_per_second_per_gpu": 2524.51 }, { "epoch": 1.2532454563610944, "grad_norm": 2.375, "learning_rate": 1.944748182351859e-05, "loss": 0.3686, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12550, "tokens_per_second_per_gpu": 2521.32 }, { "epoch": 1.2542440583183543, "grad_norm": 3.515625, "learning_rate": 1.943932640344689e-05, "loss": 0.3676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12560, "tokens_per_second_per_gpu": 2414.97 }, { "epoch": 1.2552426602756142, "grad_norm": 3.484375, "learning_rate": 1.9431166711398234e-05, "loss": 0.3635, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12570, "tokens_per_second_per_gpu": 2427.54 }, { "epoch": 1.256241262232874, "grad_norm": 3.0625, "learning_rate": 1.942300275239588e-05, "loss": 0.3444, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12580, "tokens_per_second_per_gpu": 2360.92 }, { "epoch": 1.257239864190134, "grad_norm": 3.125, "learning_rate": 1.9414834531465712e-05, "loss": 0.3798, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12590, "tokens_per_second_per_gpu": 2686.79 }, { "epoch": 1.2582384661473935, "grad_norm": 2.9375, "learning_rate": 1.940666205363623e-05, "loss": 0.3988, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12600, "tokens_per_second_per_gpu": 2389.7 }, { "epoch": 1.2592370681046534, "grad_norm": 3.171875, "learning_rate": 1.9398485323938575e-05, "loss": 0.362, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12610, "tokens_per_second_per_gpu": 2307.95 }, { "epoch": 1.2602356700619133, "grad_norm": 3.5, "learning_rate": 1.9390304347406485e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12620, "tokens_per_second_per_gpu": 2619.68 }, { "epoch": 1.2612342720191732, "grad_norm": 4.15625, "learning_rate": 1.9382119129076312e-05, "loss": 0.3932, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12630, "tokens_per_second_per_gpu": 2450.81 }, { "epoch": 1.262232873976433, "grad_norm": 3.78125, "learning_rate": 1.9373929673987037e-05, "loss": 0.3585, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12640, "tokens_per_second_per_gpu": 2462.93 }, { "epoch": 1.263231475933693, "grad_norm": 3.953125, "learning_rate": 1.936573598718024e-05, "loss": 0.4196, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12650, "tokens_per_second_per_gpu": 2603.94 }, { "epoch": 1.2642300778909528, "grad_norm": 3.515625, "learning_rate": 1.9357538073700092e-05, "loss": 0.3967, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12660, "tokens_per_second_per_gpu": 2602.66 }, { "epoch": 1.2652286798482124, "grad_norm": 3.109375, "learning_rate": 1.93493359385934e-05, "loss": 0.3779, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12670, "tokens_per_second_per_gpu": 2480.45 }, { "epoch": 1.2662272818054723, "grad_norm": 2.125, "learning_rate": 1.9341129586909532e-05, "loss": 0.386, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12680, "tokens_per_second_per_gpu": 2485.09 }, { "epoch": 1.2672258837627322, "grad_norm": 3.40625, "learning_rate": 1.9332919023700474e-05, "loss": 0.3976, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12690, "tokens_per_second_per_gpu": 2536.58 }, { "epoch": 1.268224485719992, "grad_norm": 3.109375, "learning_rate": 1.9324704254020802e-05, "loss": 0.3673, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12700, "tokens_per_second_per_gpu": 2557.09 }, { "epoch": 1.269223087677252, "grad_norm": 3.984375, "learning_rate": 1.931648528292768e-05, "loss": 0.3992, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12710, "tokens_per_second_per_gpu": 2561.06 }, { "epoch": 1.2702216896345118, "grad_norm": 3.6875, "learning_rate": 1.9308262115480852e-05, "loss": 0.3766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12720, "tokens_per_second_per_gpu": 2502.48 }, { "epoch": 1.2712202915917716, "grad_norm": 3.25, "learning_rate": 1.930003475674266e-05, "loss": 0.3811, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12730, "tokens_per_second_per_gpu": 2621.8 }, { "epoch": 1.2722188935490313, "grad_norm": 3.125, "learning_rate": 1.9291803211778005e-05, "loss": 0.3865, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12740, "tokens_per_second_per_gpu": 2526.52 }, { "epoch": 1.2732174955062912, "grad_norm": 3.234375, "learning_rate": 1.928356748565439e-05, "loss": 0.3306, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12750, "tokens_per_second_per_gpu": 2509.0 }, { "epoch": 1.274216097463551, "grad_norm": 3.21875, "learning_rate": 1.927532758344187e-05, "loss": 0.3809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12760, "tokens_per_second_per_gpu": 2450.5 }, { "epoch": 1.275214699420811, "grad_norm": 4.25, "learning_rate": 1.926708351021309e-05, "loss": 0.3631, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12770, "tokens_per_second_per_gpu": 2522.14 }, { "epoch": 1.2762133013780708, "grad_norm": 3.296875, "learning_rate": 1.925883527104324e-05, "loss": 0.406, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12780, "tokens_per_second_per_gpu": 2414.96 }, { "epoch": 1.2772119033353304, "grad_norm": 3.46875, "learning_rate": 1.9250582871010104e-05, "loss": 0.3832, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12790, "tokens_per_second_per_gpu": 2507.72 }, { "epoch": 1.2782105052925905, "grad_norm": 3.59375, "learning_rate": 1.9242326315194e-05, "loss": 0.3585, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12800, "tokens_per_second_per_gpu": 2487.16 }, { "epoch": 1.2792091072498502, "grad_norm": 3.5625, "learning_rate": 1.9234065608677812e-05, "loss": 0.3769, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12810, "tokens_per_second_per_gpu": 2544.37 }, { "epoch": 1.28020770920711, "grad_norm": 3.046875, "learning_rate": 1.9225800756546994e-05, "loss": 0.3898, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12820, "tokens_per_second_per_gpu": 2551.35 }, { "epoch": 1.28120631116437, "grad_norm": 2.59375, "learning_rate": 1.9217531763889533e-05, "loss": 0.3136, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12830, "tokens_per_second_per_gpu": 2350.06 }, { "epoch": 1.2822049131216298, "grad_norm": 2.65625, "learning_rate": 1.920925863579598e-05, "loss": 0.3604, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12840, "tokens_per_second_per_gpu": 2511.66 }, { "epoch": 1.2832035150788896, "grad_norm": 3.015625, "learning_rate": 1.9200981377359418e-05, "loss": 0.3504, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12850, "tokens_per_second_per_gpu": 2540.24 }, { "epoch": 1.2842021170361493, "grad_norm": 3.75, "learning_rate": 1.9192699993675483e-05, "loss": 0.3937, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12860, "tokens_per_second_per_gpu": 2404.07 }, { "epoch": 1.2852007189934092, "grad_norm": 2.671875, "learning_rate": 1.9184414489842345e-05, "loss": 0.3768, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12870, "tokens_per_second_per_gpu": 2472.04 }, { "epoch": 1.286199320950669, "grad_norm": 3.5, "learning_rate": 1.9176124870960717e-05, "loss": 0.4667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12880, "tokens_per_second_per_gpu": 2440.02 }, { "epoch": 1.287197922907929, "grad_norm": 2.515625, "learning_rate": 1.9167831142133834e-05, "loss": 0.3272, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12890, "tokens_per_second_per_gpu": 2432.44 }, { "epoch": 1.2881965248651888, "grad_norm": 3.03125, "learning_rate": 1.915953330846747e-05, "loss": 0.398, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12900, "tokens_per_second_per_gpu": 2403.93 }, { "epoch": 1.2891951268224486, "grad_norm": 4.6875, "learning_rate": 1.9151231375069935e-05, "loss": 0.4524, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12910, "tokens_per_second_per_gpu": 2567.85 }, { "epoch": 1.2901937287797085, "grad_norm": 3.359375, "learning_rate": 1.9142925347052036e-05, "loss": 0.3786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12920, "tokens_per_second_per_gpu": 2454.19 }, { "epoch": 1.2911923307369682, "grad_norm": 2.703125, "learning_rate": 1.913461522952712e-05, "loss": 0.3214, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12930, "tokens_per_second_per_gpu": 2612.12 }, { "epoch": 1.292190932694228, "grad_norm": 3.3125, "learning_rate": 1.9126301027611058e-05, "loss": 0.3876, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12940, "tokens_per_second_per_gpu": 2359.78 }, { "epoch": 1.293189534651488, "grad_norm": 2.65625, "learning_rate": 1.9117982746422214e-05, "loss": 0.3709, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12950, "tokens_per_second_per_gpu": 2429.56 }, { "epoch": 1.2941881366087478, "grad_norm": 3.1875, "learning_rate": 1.9109660391081483e-05, "loss": 0.4212, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12960, "tokens_per_second_per_gpu": 2623.09 }, { "epoch": 1.2951867385660076, "grad_norm": 3.34375, "learning_rate": 1.9101333966712255e-05, "loss": 0.3885, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12970, "tokens_per_second_per_gpu": 2588.93 }, { "epoch": 1.2961853405232675, "grad_norm": 2.40625, "learning_rate": 1.9093003478440434e-05, "loss": 0.3609, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12980, "tokens_per_second_per_gpu": 2523.31 }, { "epoch": 1.2971839424805274, "grad_norm": 3.109375, "learning_rate": 1.9084668931394416e-05, "loss": 0.3779, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 12990, "tokens_per_second_per_gpu": 2389.39 }, { "epoch": 1.298182544437787, "grad_norm": 3.75, "learning_rate": 1.907633033070511e-05, "loss": 0.4025, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13000, "tokens_per_second_per_gpu": 2563.05 }, { "epoch": 1.299181146395047, "grad_norm": 3.046875, "learning_rate": 1.90679876815059e-05, "loss": 0.4082, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13010, "tokens_per_second_per_gpu": 2503.0 }, { "epoch": 1.3001797483523068, "grad_norm": 3.171875, "learning_rate": 1.9059640988932686e-05, "loss": 0.3692, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13020, "tokens_per_second_per_gpu": 2552.47 }, { "epoch": 1.3011783503095666, "grad_norm": 3.59375, "learning_rate": 1.9051290258123842e-05, "loss": 0.4099, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13030, "tokens_per_second_per_gpu": 2343.75 }, { "epoch": 1.3021769522668265, "grad_norm": 2.53125, "learning_rate": 1.9042935494220227e-05, "loss": 0.3427, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13040, "tokens_per_second_per_gpu": 2376.17 }, { "epoch": 1.3031755542240862, "grad_norm": 3.140625, "learning_rate": 1.9034576702365192e-05, "loss": 0.3849, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13050, "tokens_per_second_per_gpu": 2561.57 }, { "epoch": 1.3041741561813462, "grad_norm": 3.140625, "learning_rate": 1.9026213887704562e-05, "loss": 0.3827, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13060, "tokens_per_second_per_gpu": 2259.45 }, { "epoch": 1.305172758138606, "grad_norm": 2.71875, "learning_rate": 1.9017847055386635e-05, "loss": 0.3458, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13070, "tokens_per_second_per_gpu": 2406.14 }, { "epoch": 1.3061713600958658, "grad_norm": 3.078125, "learning_rate": 1.9009476210562195e-05, "loss": 0.3796, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13080, "tokens_per_second_per_gpu": 2426.36 }, { "epoch": 1.3071699620531256, "grad_norm": 3.578125, "learning_rate": 1.900110135838448e-05, "loss": 0.3867, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13090, "tokens_per_second_per_gpu": 2493.91 }, { "epoch": 1.3081685640103855, "grad_norm": 2.890625, "learning_rate": 1.8992722504009214e-05, "loss": 0.3403, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13100, "tokens_per_second_per_gpu": 2402.72 }, { "epoch": 1.3091671659676454, "grad_norm": 3.46875, "learning_rate": 1.8984339652594563e-05, "loss": 0.3518, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13110, "tokens_per_second_per_gpu": 2309.9 }, { "epoch": 1.310165767924905, "grad_norm": 3.296875, "learning_rate": 1.8975952809301168e-05, "loss": 0.4081, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13120, "tokens_per_second_per_gpu": 2413.5 }, { "epoch": 1.311164369882165, "grad_norm": 2.8125, "learning_rate": 1.8967561979292122e-05, "loss": 0.3865, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13130, "tokens_per_second_per_gpu": 2584.72 }, { "epoch": 1.3121629718394248, "grad_norm": 3.421875, "learning_rate": 1.895916716773298e-05, "loss": 0.4143, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13140, "tokens_per_second_per_gpu": 2494.95 }, { "epoch": 1.3131615737966846, "grad_norm": 2.96875, "learning_rate": 1.8950768379791736e-05, "loss": 0.419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13150, "tokens_per_second_per_gpu": 2453.84 }, { "epoch": 1.3141601757539445, "grad_norm": 4.21875, "learning_rate": 1.8942365620638843e-05, "loss": 0.4526, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13160, "tokens_per_second_per_gpu": 2358.73 }, { "epoch": 1.3151587777112044, "grad_norm": 2.796875, "learning_rate": 1.8933958895447193e-05, "loss": 0.4362, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13170, "tokens_per_second_per_gpu": 2385.92 }, { "epoch": 1.3161573796684642, "grad_norm": 3.359375, "learning_rate": 1.892554820939212e-05, "loss": 0.3718, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13180, "tokens_per_second_per_gpu": 2513.5 }, { "epoch": 1.317155981625724, "grad_norm": 2.859375, "learning_rate": 1.8917133567651397e-05, "loss": 0.3493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13190, "tokens_per_second_per_gpu": 2461.19 }, { "epoch": 1.3181545835829838, "grad_norm": 2.484375, "learning_rate": 1.8908714975405228e-05, "loss": 0.3829, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13200, "tokens_per_second_per_gpu": 2549.21 }, { "epoch": 1.3191531855402436, "grad_norm": 3.15625, "learning_rate": 1.8900292437836258e-05, "loss": 0.4392, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13210, "tokens_per_second_per_gpu": 2528.05 }, { "epoch": 1.3201517874975035, "grad_norm": 2.890625, "learning_rate": 1.8891865960129558e-05, "loss": 0.408, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13220, "tokens_per_second_per_gpu": 2714.31 }, { "epoch": 1.3211503894547634, "grad_norm": 3.125, "learning_rate": 1.8883435547472622e-05, "loss": 0.3859, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13230, "tokens_per_second_per_gpu": 2416.22 }, { "epoch": 1.3221489914120232, "grad_norm": 3.203125, "learning_rate": 1.887500120505536e-05, "loss": 0.463, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13240, "tokens_per_second_per_gpu": 2612.78 }, { "epoch": 1.3231475933692831, "grad_norm": 2.71875, "learning_rate": 1.8866562938070113e-05, "loss": 0.3401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13250, "tokens_per_second_per_gpu": 2427.68 }, { "epoch": 1.3241461953265428, "grad_norm": 3.109375, "learning_rate": 1.885812075171164e-05, "loss": 0.417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13260, "tokens_per_second_per_gpu": 2325.72 }, { "epoch": 1.3251447972838026, "grad_norm": 2.671875, "learning_rate": 1.8849674651177092e-05, "loss": 0.3658, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13270, "tokens_per_second_per_gpu": 2566.4 }, { "epoch": 1.3261433992410625, "grad_norm": 1.96875, "learning_rate": 1.884122464166606e-05, "loss": 0.3704, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13280, "tokens_per_second_per_gpu": 2464.23 }, { "epoch": 1.3271420011983224, "grad_norm": 3.71875, "learning_rate": 1.8832770728380513e-05, "loss": 0.3615, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13290, "tokens_per_second_per_gpu": 2496.78 }, { "epoch": 1.3281406031555822, "grad_norm": 3.734375, "learning_rate": 1.882431291652484e-05, "loss": 0.3959, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13300, "tokens_per_second_per_gpu": 2630.52 }, { "epoch": 1.329139205112842, "grad_norm": 3.28125, "learning_rate": 1.8815851211305823e-05, "loss": 0.3786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13310, "tokens_per_second_per_gpu": 2566.44 }, { "epoch": 1.330137807070102, "grad_norm": 2.1875, "learning_rate": 1.8807385617932647e-05, "loss": 0.3581, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13320, "tokens_per_second_per_gpu": 2360.4 }, { "epoch": 1.3311364090273616, "grad_norm": 2.359375, "learning_rate": 1.8798916141616886e-05, "loss": 0.357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13330, "tokens_per_second_per_gpu": 2324.39 }, { "epoch": 1.3321350109846215, "grad_norm": 2.96875, "learning_rate": 1.8790442787572505e-05, "loss": 0.3895, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13340, "tokens_per_second_per_gpu": 2428.94 }, { "epoch": 1.3331336129418814, "grad_norm": 3.234375, "learning_rate": 1.8781965561015852e-05, "loss": 0.3629, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13350, "tokens_per_second_per_gpu": 2364.1 }, { "epoch": 1.3341322148991412, "grad_norm": 3.53125, "learning_rate": 1.8773484467165672e-05, "loss": 0.3493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13360, "tokens_per_second_per_gpu": 2496.25 }, { "epoch": 1.3351308168564011, "grad_norm": 3.046875, "learning_rate": 1.8764999511243078e-05, "loss": 0.3546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13370, "tokens_per_second_per_gpu": 2549.0 }, { "epoch": 1.3361294188136608, "grad_norm": 2.59375, "learning_rate": 1.8756510698471564e-05, "loss": 0.3576, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13380, "tokens_per_second_per_gpu": 2282.96 }, { "epoch": 1.3371280207709206, "grad_norm": 2.78125, "learning_rate": 1.8748018034077e-05, "loss": 0.4062, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13390, "tokens_per_second_per_gpu": 2567.96 }, { "epoch": 1.3381266227281805, "grad_norm": 2.90625, "learning_rate": 1.8739521523287627e-05, "loss": 0.3876, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13400, "tokens_per_second_per_gpu": 2516.37 }, { "epoch": 1.3391252246854404, "grad_norm": 2.84375, "learning_rate": 1.8731021171334055e-05, "loss": 0.3638, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13410, "tokens_per_second_per_gpu": 2536.77 }, { "epoch": 1.3401238266427002, "grad_norm": 3.953125, "learning_rate": 1.8722516983449253e-05, "loss": 0.4169, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13420, "tokens_per_second_per_gpu": 2340.84 }, { "epoch": 1.3411224285999601, "grad_norm": 4.09375, "learning_rate": 1.871400896486856e-05, "loss": 0.407, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13430, "tokens_per_second_per_gpu": 2466.11 }, { "epoch": 1.34212103055722, "grad_norm": 3.09375, "learning_rate": 1.8705497120829664e-05, "loss": 0.4053, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13440, "tokens_per_second_per_gpu": 2478.02 }, { "epoch": 1.3431196325144796, "grad_norm": 2.984375, "learning_rate": 1.8696981456572614e-05, "loss": 0.3844, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13450, "tokens_per_second_per_gpu": 2316.19 }, { "epoch": 1.3441182344717395, "grad_norm": 2.28125, "learning_rate": 1.8688461977339814e-05, "loss": 0.3267, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13460, "tokens_per_second_per_gpu": 2569.43 }, { "epoch": 1.3451168364289994, "grad_norm": 3.1875, "learning_rate": 1.8679938688376003e-05, "loss": 0.3688, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13470, "tokens_per_second_per_gpu": 2459.37 }, { "epoch": 1.3461154383862592, "grad_norm": 3.1875, "learning_rate": 1.8671411594928287e-05, "loss": 0.4114, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13480, "tokens_per_second_per_gpu": 2405.61 }, { "epoch": 1.3471140403435191, "grad_norm": 3.328125, "learning_rate": 1.866288070224609e-05, "loss": 0.3695, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13490, "tokens_per_second_per_gpu": 2508.66 }, { "epoch": 1.348112642300779, "grad_norm": 2.9375, "learning_rate": 1.8654346015581193e-05, "loss": 0.4449, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13500, "tokens_per_second_per_gpu": 2392.62 }, { "epoch": 1.3491112442580389, "grad_norm": 3.125, "learning_rate": 1.8645807540187703e-05, "loss": 0.3977, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13510, "tokens_per_second_per_gpu": 2739.47 }, { "epoch": 1.3501098462152985, "grad_norm": 2.859375, "learning_rate": 1.8637265281322064e-05, "loss": 0.3644, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13520, "tokens_per_second_per_gpu": 2443.57 }, { "epoch": 1.3511084481725584, "grad_norm": 3.59375, "learning_rate": 1.8628719244243047e-05, "loss": 0.384, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13530, "tokens_per_second_per_gpu": 2350.35 }, { "epoch": 1.3521070501298182, "grad_norm": 3.484375, "learning_rate": 1.8620169434211752e-05, "loss": 0.3938, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13540, "tokens_per_second_per_gpu": 2295.45 }, { "epoch": 1.3531056520870781, "grad_norm": 3.484375, "learning_rate": 1.8611615856491592e-05, "loss": 0.3747, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13550, "tokens_per_second_per_gpu": 2464.54 }, { "epoch": 1.354104254044338, "grad_norm": 3.578125, "learning_rate": 1.8603058516348317e-05, "loss": 0.3998, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13560, "tokens_per_second_per_gpu": 2509.76 }, { "epoch": 1.3551028560015976, "grad_norm": 3.203125, "learning_rate": 1.859449741904998e-05, "loss": 0.3876, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13570, "tokens_per_second_per_gpu": 2473.81 }, { "epoch": 1.3561014579588577, "grad_norm": 2.59375, "learning_rate": 1.858593256986694e-05, "loss": 0.3959, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13580, "tokens_per_second_per_gpu": 2517.66 }, { "epoch": 1.3571000599161174, "grad_norm": 3.125, "learning_rate": 1.8577363974071885e-05, "loss": 0.419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13590, "tokens_per_second_per_gpu": 2492.45 }, { "epoch": 1.3580986618733772, "grad_norm": 2.59375, "learning_rate": 1.8568791636939804e-05, "loss": 0.3713, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13600, "tokens_per_second_per_gpu": 2336.32 }, { "epoch": 1.3590972638306371, "grad_norm": 2.484375, "learning_rate": 1.8560215563747972e-05, "loss": 0.37, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13610, "tokens_per_second_per_gpu": 2402.96 }, { "epoch": 1.360095865787897, "grad_norm": 2.859375, "learning_rate": 1.8551635759775993e-05, "loss": 0.3525, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13620, "tokens_per_second_per_gpu": 2539.34 }, { "epoch": 1.3610944677451569, "grad_norm": 3.390625, "learning_rate": 1.8543052230305745e-05, "loss": 0.3836, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13630, "tokens_per_second_per_gpu": 2535.64 }, { "epoch": 1.3620930697024165, "grad_norm": 3.265625, "learning_rate": 1.85344649806214e-05, "loss": 0.3954, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13640, "tokens_per_second_per_gpu": 2637.16 }, { "epoch": 1.3630916716596764, "grad_norm": 3.125, "learning_rate": 1.8525874016009438e-05, "loss": 0.4344, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13650, "tokens_per_second_per_gpu": 2720.16 }, { "epoch": 1.3640902736169362, "grad_norm": 3.21875, "learning_rate": 1.8517279341758603e-05, "loss": 0.3406, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13660, "tokens_per_second_per_gpu": 2368.04 }, { "epoch": 1.3650888755741961, "grad_norm": 3.890625, "learning_rate": 1.8508680963159948e-05, "loss": 0.357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13670, "tokens_per_second_per_gpu": 2415.23 }, { "epoch": 1.366087477531456, "grad_norm": 3.921875, "learning_rate": 1.8500078885506782e-05, "loss": 0.3915, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13680, "tokens_per_second_per_gpu": 2376.95 }, { "epoch": 1.3670860794887159, "grad_norm": 3.0625, "learning_rate": 1.849147311409471e-05, "loss": 0.3563, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13690, "tokens_per_second_per_gpu": 2459.44 }, { "epoch": 1.3680846814459757, "grad_norm": 2.96875, "learning_rate": 1.84828636542216e-05, "loss": 0.4365, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13700, "tokens_per_second_per_gpu": 2522.2 }, { "epoch": 1.3690832834032354, "grad_norm": 2.890625, "learning_rate": 1.8474250511187592e-05, "loss": 0.3843, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13710, "tokens_per_second_per_gpu": 2474.81 }, { "epoch": 1.3700818853604952, "grad_norm": 3.015625, "learning_rate": 1.8465633690295096e-05, "loss": 0.3662, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13720, "tokens_per_second_per_gpu": 2656.01 }, { "epoch": 1.3710804873177551, "grad_norm": 3.4375, "learning_rate": 1.8457013196848785e-05, "loss": 0.3665, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13730, "tokens_per_second_per_gpu": 2469.85 }, { "epoch": 1.372079089275015, "grad_norm": 3.1875, "learning_rate": 1.8448389036155596e-05, "loss": 0.3912, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13740, "tokens_per_second_per_gpu": 2441.76 }, { "epoch": 1.3730776912322749, "grad_norm": 2.703125, "learning_rate": 1.843976121352472e-05, "loss": 0.3973, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13750, "tokens_per_second_per_gpu": 2395.5 }, { "epoch": 1.3740762931895347, "grad_norm": 2.46875, "learning_rate": 1.8431129734267603e-05, "loss": 0.3535, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13760, "tokens_per_second_per_gpu": 2453.36 }, { "epoch": 1.3750748951467946, "grad_norm": 3.359375, "learning_rate": 1.8422494603697944e-05, "loss": 0.3434, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13770, "tokens_per_second_per_gpu": 2432.84 }, { "epoch": 1.3760734971040542, "grad_norm": 3.34375, "learning_rate": 1.8413855827131678e-05, "loss": 0.3667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13780, "tokens_per_second_per_gpu": 2322.23 }, { "epoch": 1.3770720990613141, "grad_norm": 3.328125, "learning_rate": 1.840521340988701e-05, "loss": 0.4192, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13790, "tokens_per_second_per_gpu": 2480.02 }, { "epoch": 1.378070701018574, "grad_norm": 3.0, "learning_rate": 1.839656735728436e-05, "loss": 0.3488, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13800, "tokens_per_second_per_gpu": 2582.3 }, { "epoch": 1.3790693029758339, "grad_norm": 3.46875, "learning_rate": 1.8387917674646408e-05, "loss": 0.4193, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13810, "tokens_per_second_per_gpu": 2353.88 }, { "epoch": 1.3800679049330937, "grad_norm": 3.0625, "learning_rate": 1.8379264367298043e-05, "loss": 0.3917, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13820, "tokens_per_second_per_gpu": 2423.44 }, { "epoch": 1.3810665068903534, "grad_norm": 3.109375, "learning_rate": 1.8370607440566412e-05, "loss": 0.3822, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13830, "tokens_per_second_per_gpu": 2481.1 }, { "epoch": 1.3820651088476135, "grad_norm": 2.609375, "learning_rate": 1.8361946899780874e-05, "loss": 0.3541, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13840, "tokens_per_second_per_gpu": 2494.78 }, { "epoch": 1.3830637108048731, "grad_norm": 2.84375, "learning_rate": 1.8353282750273016e-05, "loss": 0.3726, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13850, "tokens_per_second_per_gpu": 2584.91 }, { "epoch": 1.384062312762133, "grad_norm": 5.0, "learning_rate": 1.8344614997376646e-05, "loss": 0.4346, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13860, "tokens_per_second_per_gpu": 2364.65 }, { "epoch": 1.3850609147193929, "grad_norm": 4.09375, "learning_rate": 1.8335943646427803e-05, "loss": 0.3944, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13870, "tokens_per_second_per_gpu": 2473.15 }, { "epoch": 1.3860595166766527, "grad_norm": 3.46875, "learning_rate": 1.832726870276472e-05, "loss": 0.3994, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13880, "tokens_per_second_per_gpu": 2438.66 }, { "epoch": 1.3870581186339126, "grad_norm": 3.53125, "learning_rate": 1.8318590171727846e-05, "loss": 0.4132, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13890, "tokens_per_second_per_gpu": 2510.74 }, { "epoch": 1.3880567205911722, "grad_norm": 2.859375, "learning_rate": 1.8309908058659858e-05, "loss": 0.3826, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13900, "tokens_per_second_per_gpu": 2403.19 }, { "epoch": 1.3890553225484321, "grad_norm": 3.671875, "learning_rate": 1.8301222368905624e-05, "loss": 0.3764, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13910, "tokens_per_second_per_gpu": 2565.64 }, { "epoch": 1.390053924505692, "grad_norm": 3.0625, "learning_rate": 1.8292533107812202e-05, "loss": 0.365, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13920, "tokens_per_second_per_gpu": 2447.79 }, { "epoch": 1.3910525264629519, "grad_norm": 3.140625, "learning_rate": 1.828384028072887e-05, "loss": 0.3508, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13930, "tokens_per_second_per_gpu": 2596.01 }, { "epoch": 1.3920511284202117, "grad_norm": 3.15625, "learning_rate": 1.8275143893007092e-05, "loss": 0.3851, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13940, "tokens_per_second_per_gpu": 2685.4 }, { "epoch": 1.3930497303774716, "grad_norm": 4.0, "learning_rate": 1.826644395000052e-05, "loss": 0.3702, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13950, "tokens_per_second_per_gpu": 2200.93 }, { "epoch": 1.3940483323347315, "grad_norm": 3.375, "learning_rate": 1.8257740457065005e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13960, "tokens_per_second_per_gpu": 2421.89 }, { "epoch": 1.3950469342919911, "grad_norm": 3.15625, "learning_rate": 1.824903341955857e-05, "loss": 0.4062, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13970, "tokens_per_second_per_gpu": 2428.18 }, { "epoch": 1.396045536249251, "grad_norm": 3.46875, "learning_rate": 1.8240322842841438e-05, "loss": 0.4059, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13980, "tokens_per_second_per_gpu": 2494.14 }, { "epoch": 1.3970441382065109, "grad_norm": 3.015625, "learning_rate": 1.8231608732275987e-05, "loss": 0.3774, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 13990, "tokens_per_second_per_gpu": 2608.97 }, { "epoch": 1.3980427401637707, "grad_norm": 3.140625, "learning_rate": 1.8222891093226794e-05, "loss": 0.3585, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14000, "tokens_per_second_per_gpu": 2368.11 }, { "epoch": 1.3990413421210306, "grad_norm": 3.0625, "learning_rate": 1.8214169931060592e-05, "loss": 0.3372, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14010, "tokens_per_second_per_gpu": 2559.87 }, { "epoch": 1.4000399440782905, "grad_norm": 3.34375, "learning_rate": 1.8205445251146292e-05, "loss": 0.3774, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14020, "tokens_per_second_per_gpu": 2380.15 }, { "epoch": 1.4010385460355503, "grad_norm": 3.359375, "learning_rate": 1.8196717058854968e-05, "loss": 0.4067, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14030, "tokens_per_second_per_gpu": 2424.4 }, { "epoch": 1.40203714799281, "grad_norm": 3.453125, "learning_rate": 1.818798535955985e-05, "loss": 0.388, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14040, "tokens_per_second_per_gpu": 2526.65 }, { "epoch": 1.4030357499500699, "grad_norm": 2.421875, "learning_rate": 1.817925015863634e-05, "loss": 0.3609, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14050, "tokens_per_second_per_gpu": 2341.9 }, { "epoch": 1.4040343519073297, "grad_norm": 3.796875, "learning_rate": 1.8170511461461983e-05, "loss": 0.3618, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14060, "tokens_per_second_per_gpu": 2271.31 }, { "epoch": 1.4050329538645896, "grad_norm": 2.625, "learning_rate": 1.8161769273416487e-05, "loss": 0.3987, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14070, "tokens_per_second_per_gpu": 2431.4 }, { "epoch": 1.4060315558218495, "grad_norm": 2.578125, "learning_rate": 1.8153023599881696e-05, "loss": 0.3864, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14080, "tokens_per_second_per_gpu": 2401.04 }, { "epoch": 1.4070301577791091, "grad_norm": 4.3125, "learning_rate": 1.8144274446241616e-05, "loss": 0.3559, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14090, "tokens_per_second_per_gpu": 2417.15 }, { "epoch": 1.4080287597363692, "grad_norm": 3.0, "learning_rate": 1.813552181788238e-05, "loss": 0.3558, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14100, "tokens_per_second_per_gpu": 2652.09 }, { "epoch": 1.4090273616936289, "grad_norm": 3.4375, "learning_rate": 1.8126765720192272e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14110, "tokens_per_second_per_gpu": 2313.18 }, { "epoch": 1.4100259636508887, "grad_norm": 4.40625, "learning_rate": 1.8118006158561703e-05, "loss": 0.3818, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14120, "tokens_per_second_per_gpu": 2435.47 }, { "epoch": 1.4110245656081486, "grad_norm": 4.9375, "learning_rate": 1.8109243138383225e-05, "loss": 0.4159, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14130, "tokens_per_second_per_gpu": 2407.77 }, { "epoch": 1.4120231675654085, "grad_norm": 2.296875, "learning_rate": 1.810047666505151e-05, "loss": 0.3539, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14140, "tokens_per_second_per_gpu": 2324.34 }, { "epoch": 1.4130217695226683, "grad_norm": 3.75, "learning_rate": 1.809170674396336e-05, "loss": 0.4174, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14150, "tokens_per_second_per_gpu": 2540.51 }, { "epoch": 1.414020371479928, "grad_norm": 3.328125, "learning_rate": 1.8082933380517703e-05, "loss": 0.3493, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14160, "tokens_per_second_per_gpu": 2597.51 }, { "epoch": 1.4150189734371879, "grad_norm": 4.25, "learning_rate": 1.807415658011558e-05, "loss": 0.3734, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14170, "tokens_per_second_per_gpu": 2397.17 }, { "epoch": 1.4160175753944477, "grad_norm": 2.734375, "learning_rate": 1.8065376348160153e-05, "loss": 0.3544, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14180, "tokens_per_second_per_gpu": 2422.33 }, { "epoch": 1.4170161773517076, "grad_norm": 3.265625, "learning_rate": 1.805659269005669e-05, "loss": 0.3975, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14190, "tokens_per_second_per_gpu": 2255.13 }, { "epoch": 1.4180147793089675, "grad_norm": 4.53125, "learning_rate": 1.8047805611212582e-05, "loss": 0.4127, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14200, "tokens_per_second_per_gpu": 2535.8 }, { "epoch": 1.4190133812662273, "grad_norm": 2.953125, "learning_rate": 1.8039015117037303e-05, "loss": 0.3867, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14210, "tokens_per_second_per_gpu": 2256.29 }, { "epoch": 1.4200119832234872, "grad_norm": 3.125, "learning_rate": 1.803022121294245e-05, "loss": 0.3997, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14220, "tokens_per_second_per_gpu": 2385.86 }, { "epoch": 1.4210105851807469, "grad_norm": 2.28125, "learning_rate": 1.8021423904341715e-05, "loss": 0.3652, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14230, "tokens_per_second_per_gpu": 2463.36 }, { "epoch": 1.4220091871380067, "grad_norm": 3.140625, "learning_rate": 1.801262319665088e-05, "loss": 0.3556, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14240, "tokens_per_second_per_gpu": 2540.35 }, { "epoch": 1.4230077890952666, "grad_norm": 3.265625, "learning_rate": 1.8003819095287822e-05, "loss": 0.3672, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14250, "tokens_per_second_per_gpu": 2494.33 }, { "epoch": 1.4240063910525265, "grad_norm": 2.921875, "learning_rate": 1.7995011605672508e-05, "loss": 0.3582, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14260, "tokens_per_second_per_gpu": 2249.36 }, { "epoch": 1.4250049930097863, "grad_norm": 4.46875, "learning_rate": 1.7986200733226987e-05, "loss": 0.3847, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14270, "tokens_per_second_per_gpu": 2399.15 }, { "epoch": 1.4260035949670462, "grad_norm": 4.5625, "learning_rate": 1.79773864833754e-05, "loss": 0.4352, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14280, "tokens_per_second_per_gpu": 2500.82 }, { "epoch": 1.427002196924306, "grad_norm": 3.3125, "learning_rate": 1.7968568861543962e-05, "loss": 0.3915, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14290, "tokens_per_second_per_gpu": 2629.08 }, { "epoch": 1.4280007988815657, "grad_norm": 2.796875, "learning_rate": 1.7959747873160958e-05, "loss": 0.3713, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14300, "tokens_per_second_per_gpu": 2215.62 }, { "epoch": 1.4289994008388256, "grad_norm": 3.140625, "learning_rate": 1.7950923523656755e-05, "loss": 0.3707, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14310, "tokens_per_second_per_gpu": 2543.73 }, { "epoch": 1.4299980027960855, "grad_norm": 2.8125, "learning_rate": 1.794209581846378e-05, "loss": 0.3954, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14320, "tokens_per_second_per_gpu": 2530.03 }, { "epoch": 1.4309966047533453, "grad_norm": 3.0, "learning_rate": 1.7933264763016537e-05, "loss": 0.4088, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14330, "tokens_per_second_per_gpu": 2527.04 }, { "epoch": 1.4319952067106052, "grad_norm": 3.421875, "learning_rate": 1.7924430362751583e-05, "loss": 0.3975, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14340, "tokens_per_second_per_gpu": 2648.37 }, { "epoch": 1.4329938086678649, "grad_norm": 2.671875, "learning_rate": 1.791559262310753e-05, "loss": 0.346, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14350, "tokens_per_second_per_gpu": 2561.03 }, { "epoch": 1.433992410625125, "grad_norm": 3.375, "learning_rate": 1.7906751549525064e-05, "loss": 0.3716, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14360, "tokens_per_second_per_gpu": 2398.54 }, { "epoch": 1.4349910125823846, "grad_norm": 3.046875, "learning_rate": 1.7897907147446907e-05, "loss": 0.421, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14370, "tokens_per_second_per_gpu": 2465.9 }, { "epoch": 1.4359896145396445, "grad_norm": 2.328125, "learning_rate": 1.788905942231784e-05, "loss": 0.3397, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14380, "tokens_per_second_per_gpu": 2608.94 }, { "epoch": 1.4369882164969043, "grad_norm": 3.203125, "learning_rate": 1.788020837958468e-05, "loss": 0.385, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14390, "tokens_per_second_per_gpu": 2346.23 }, { "epoch": 1.4379868184541642, "grad_norm": 2.84375, "learning_rate": 1.7871354024696296e-05, "loss": 0.4135, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14400, "tokens_per_second_per_gpu": 2606.42 }, { "epoch": 1.438985420411424, "grad_norm": 3.484375, "learning_rate": 1.7862496363103586e-05, "loss": 0.4197, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14410, "tokens_per_second_per_gpu": 2377.24 }, { "epoch": 1.4399840223686837, "grad_norm": 3.90625, "learning_rate": 1.7853635400259494e-05, "loss": 0.4108, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14420, "tokens_per_second_per_gpu": 2328.55 }, { "epoch": 1.4409826243259438, "grad_norm": 3.75, "learning_rate": 1.7844771141618996e-05, "loss": 0.4176, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14430, "tokens_per_second_per_gpu": 2507.0 }, { "epoch": 1.4419812262832035, "grad_norm": 4.0625, "learning_rate": 1.783590359263908e-05, "loss": 0.4028, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14440, "tokens_per_second_per_gpu": 2356.69 }, { "epoch": 1.4429798282404633, "grad_norm": 2.59375, "learning_rate": 1.7827032758778787e-05, "loss": 0.342, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14450, "tokens_per_second_per_gpu": 2458.58 }, { "epoch": 1.4439784301977232, "grad_norm": 3.625, "learning_rate": 1.7818158645499157e-05, "loss": 0.3736, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14460, "tokens_per_second_per_gpu": 2326.45 }, { "epoch": 1.444977032154983, "grad_norm": 2.28125, "learning_rate": 1.7809281258263253e-05, "loss": 0.3273, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14470, "tokens_per_second_per_gpu": 2542.3 }, { "epoch": 1.445975634112243, "grad_norm": 3.21875, "learning_rate": 1.7800400602536165e-05, "loss": 0.4091, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14480, "tokens_per_second_per_gpu": 2391.44 }, { "epoch": 1.4469742360695026, "grad_norm": 3.90625, "learning_rate": 1.7791516683784986e-05, "loss": 0.4127, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14490, "tokens_per_second_per_gpu": 2525.83 }, { "epoch": 1.4479728380267625, "grad_norm": 3.203125, "learning_rate": 1.7782629507478822e-05, "loss": 0.3897, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14500, "tokens_per_second_per_gpu": 2438.37 }, { "epoch": 1.4489714399840223, "grad_norm": 3.328125, "learning_rate": 1.777373907908878e-05, "loss": 0.3746, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14510, "tokens_per_second_per_gpu": 2547.23 }, { "epoch": 1.4499700419412822, "grad_norm": 4.09375, "learning_rate": 1.7764845404087973e-05, "loss": 0.4196, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14520, "tokens_per_second_per_gpu": 2428.02 }, { "epoch": 1.450968643898542, "grad_norm": 3.421875, "learning_rate": 1.775594848795151e-05, "loss": 0.3911, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14530, "tokens_per_second_per_gpu": 2559.47 }, { "epoch": 1.451967245855802, "grad_norm": 2.75, "learning_rate": 1.774704833615649e-05, "loss": 0.3852, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14540, "tokens_per_second_per_gpu": 2346.01 }, { "epoch": 1.4529658478130618, "grad_norm": 2.234375, "learning_rate": 1.7738144954182018e-05, "loss": 0.4694, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14550, "tokens_per_second_per_gpu": 2335.62 }, { "epoch": 1.4539644497703215, "grad_norm": 4.34375, "learning_rate": 1.772923834750918e-05, "loss": 0.4106, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14560, "tokens_per_second_per_gpu": 2558.92 }, { "epoch": 1.4549630517275813, "grad_norm": 2.84375, "learning_rate": 1.7720328521621043e-05, "loss": 0.377, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14570, "tokens_per_second_per_gpu": 2495.57 }, { "epoch": 1.4559616536848412, "grad_norm": 3.125, "learning_rate": 1.771141548200266e-05, "loss": 0.4332, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14580, "tokens_per_second_per_gpu": 2532.99 }, { "epoch": 1.456960255642101, "grad_norm": 3.46875, "learning_rate": 1.770249923414106e-05, "loss": 0.3615, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14590, "tokens_per_second_per_gpu": 2375.0 }, { "epoch": 1.457958857599361, "grad_norm": 3.625, "learning_rate": 1.769357978352526e-05, "loss": 0.417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14600, "tokens_per_second_per_gpu": 2435.03 }, { "epoch": 1.4589574595566206, "grad_norm": 3.328125, "learning_rate": 1.7684657135646224e-05, "loss": 0.3695, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14610, "tokens_per_second_per_gpu": 2424.58 }, { "epoch": 1.4599560615138807, "grad_norm": 3.59375, "learning_rate": 1.7675731295996906e-05, "loss": 0.3703, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14620, "tokens_per_second_per_gpu": 2549.49 }, { "epoch": 1.4609546634711403, "grad_norm": 3.109375, "learning_rate": 1.7666802270072214e-05, "loss": 0.3275, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14630, "tokens_per_second_per_gpu": 2504.69 }, { "epoch": 1.4619532654284002, "grad_norm": 3.828125, "learning_rate": 1.7657870063369025e-05, "loss": 0.3766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14640, "tokens_per_second_per_gpu": 2531.29 }, { "epoch": 1.46295186738566, "grad_norm": 3.375, "learning_rate": 1.764893468138616e-05, "loss": 0.3736, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14650, "tokens_per_second_per_gpu": 2378.99 }, { "epoch": 1.46395046934292, "grad_norm": 3.40625, "learning_rate": 1.763999612962442e-05, "loss": 0.3818, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14660, "tokens_per_second_per_gpu": 2599.83 }, { "epoch": 1.4649490713001798, "grad_norm": 3.453125, "learning_rate": 1.7631054413586526e-05, "loss": 0.399, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14670, "tokens_per_second_per_gpu": 2503.43 }, { "epoch": 1.4659476732574395, "grad_norm": 2.984375, "learning_rate": 1.762210953877717e-05, "loss": 0.4223, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14680, "tokens_per_second_per_gpu": 2326.02 }, { "epoch": 1.4669462752146996, "grad_norm": 3.546875, "learning_rate": 1.7613161510702984e-05, "loss": 0.3965, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14690, "tokens_per_second_per_gpu": 2438.41 }, { "epoch": 1.4679448771719592, "grad_norm": 3.390625, "learning_rate": 1.7604210334872535e-05, "loss": 0.387, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14700, "tokens_per_second_per_gpu": 2452.81 }, { "epoch": 1.468943479129219, "grad_norm": 3.859375, "learning_rate": 1.759525601679633e-05, "loss": 0.3823, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14710, "tokens_per_second_per_gpu": 2365.33 }, { "epoch": 1.469942081086479, "grad_norm": 3.59375, "learning_rate": 1.758629856198682e-05, "loss": 0.3737, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14720, "tokens_per_second_per_gpu": 2322.62 }, { "epoch": 1.4709406830437388, "grad_norm": 3.015625, "learning_rate": 1.7577337975958365e-05, "loss": 0.3295, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14730, "tokens_per_second_per_gpu": 2382.1 }, { "epoch": 1.4719392850009987, "grad_norm": 3.125, "learning_rate": 1.7568374264227278e-05, "loss": 0.4049, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14740, "tokens_per_second_per_gpu": 2532.38 }, { "epoch": 1.4729378869582583, "grad_norm": 3.390625, "learning_rate": 1.755940743231178e-05, "loss": 0.3773, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14750, "tokens_per_second_per_gpu": 2325.55 }, { "epoch": 1.4739364889155182, "grad_norm": 3.21875, "learning_rate": 1.755043748573202e-05, "loss": 0.3936, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14760, "tokens_per_second_per_gpu": 2339.45 }, { "epoch": 1.474935090872778, "grad_norm": 3.34375, "learning_rate": 1.7541464430010065e-05, "loss": 0.347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14770, "tokens_per_second_per_gpu": 2491.14 }, { "epoch": 1.475933692830038, "grad_norm": 3.71875, "learning_rate": 1.7532488270669886e-05, "loss": 0.4098, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14780, "tokens_per_second_per_gpu": 2424.67 }, { "epoch": 1.4769322947872978, "grad_norm": 3.625, "learning_rate": 1.7523509013237374e-05, "loss": 0.376, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14790, "tokens_per_second_per_gpu": 2711.66 }, { "epoch": 1.4779308967445577, "grad_norm": 3.328125, "learning_rate": 1.7514526663240327e-05, "loss": 0.3495, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14800, "tokens_per_second_per_gpu": 2447.48 }, { "epoch": 1.4789294987018176, "grad_norm": 3.625, "learning_rate": 1.7505541226208445e-05, "loss": 0.3946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14810, "tokens_per_second_per_gpu": 2500.56 }, { "epoch": 1.4799281006590772, "grad_norm": 4.21875, "learning_rate": 1.7496552707673323e-05, "loss": 0.3895, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14820, "tokens_per_second_per_gpu": 2558.36 }, { "epoch": 1.480926702616337, "grad_norm": 4.21875, "learning_rate": 1.7487561113168465e-05, "loss": 0.4024, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14830, "tokens_per_second_per_gpu": 2349.8 }, { "epoch": 1.481925304573597, "grad_norm": 3.421875, "learning_rate": 1.7478566448229262e-05, "loss": 0.3787, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14840, "tokens_per_second_per_gpu": 2515.28 }, { "epoch": 1.4829239065308568, "grad_norm": 3.53125, "learning_rate": 1.746956871839299e-05, "loss": 0.401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14850, "tokens_per_second_per_gpu": 2328.94 }, { "epoch": 1.4839225084881167, "grad_norm": 3.15625, "learning_rate": 1.746056792919882e-05, "loss": 0.4052, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14860, "tokens_per_second_per_gpu": 2512.94 }, { "epoch": 1.4849211104453763, "grad_norm": 3.453125, "learning_rate": 1.7451564086187804e-05, "loss": 0.4168, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14870, "tokens_per_second_per_gpu": 2592.81 }, { "epoch": 1.4859197124026364, "grad_norm": 4.03125, "learning_rate": 1.7442557194902868e-05, "loss": 0.4032, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14880, "tokens_per_second_per_gpu": 2505.58 }, { "epoch": 1.486918314359896, "grad_norm": 3.625, "learning_rate": 1.7433547260888828e-05, "loss": 0.3497, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14890, "tokens_per_second_per_gpu": 2436.34 }, { "epoch": 1.487916916317156, "grad_norm": 3.1875, "learning_rate": 1.742453428969236e-05, "loss": 0.4103, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14900, "tokens_per_second_per_gpu": 2397.9 }, { "epoch": 1.4889155182744158, "grad_norm": 3.984375, "learning_rate": 1.7415518286862018e-05, "loss": 0.4364, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14910, "tokens_per_second_per_gpu": 2369.47 }, { "epoch": 1.4899141202316757, "grad_norm": 2.734375, "learning_rate": 1.7406499257948218e-05, "loss": 0.3598, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14920, "tokens_per_second_per_gpu": 2579.24 }, { "epoch": 1.4909127221889356, "grad_norm": 2.46875, "learning_rate": 1.7397477208503233e-05, "loss": 0.4074, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14930, "tokens_per_second_per_gpu": 2455.61 }, { "epoch": 1.4919113241461952, "grad_norm": 3.453125, "learning_rate": 1.738845214408121e-05, "loss": 0.3815, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14940, "tokens_per_second_per_gpu": 2273.67 }, { "epoch": 1.4929099261034553, "grad_norm": 2.515625, "learning_rate": 1.7379424070238148e-05, "loss": 0.3593, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14950, "tokens_per_second_per_gpu": 2515.66 }, { "epoch": 1.493908528060715, "grad_norm": 3.21875, "learning_rate": 1.7370392992531893e-05, "loss": 0.44, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14960, "tokens_per_second_per_gpu": 2592.23 }, { "epoch": 1.4949071300179748, "grad_norm": 2.953125, "learning_rate": 1.7361358916522132e-05, "loss": 0.3682, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14970, "tokens_per_second_per_gpu": 2421.75 }, { "epoch": 1.4959057319752347, "grad_norm": 3.53125, "learning_rate": 1.7352321847770422e-05, "loss": 0.3788, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14980, "tokens_per_second_per_gpu": 2491.22 }, { "epoch": 1.4969043339324946, "grad_norm": 2.96875, "learning_rate": 1.734328179184014e-05, "loss": 0.3887, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 14990, "tokens_per_second_per_gpu": 2465.86 }, { "epoch": 1.4979029358897544, "grad_norm": 3.484375, "learning_rate": 1.7334238754296515e-05, "loss": 0.3824, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15000, "tokens_per_second_per_gpu": 2508.14 }, { "epoch": 1.498901537847014, "grad_norm": 3.96875, "learning_rate": 1.732519274070661e-05, "loss": 0.3574, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15010, "tokens_per_second_per_gpu": 2321.71 }, { "epoch": 1.499900139804274, "grad_norm": 3.234375, "learning_rate": 1.7316143756639308e-05, "loss": 0.3824, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15020, "tokens_per_second_per_gpu": 2506.04 }, { "epoch": 1.5008987417615338, "grad_norm": 3.15625, "learning_rate": 1.730709180766534e-05, "loss": 0.4269, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15030, "tokens_per_second_per_gpu": 2398.83 }, { "epoch": 1.5018973437187937, "grad_norm": 3.953125, "learning_rate": 1.7298036899357247e-05, "loss": 0.3515, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15040, "tokens_per_second_per_gpu": 2436.11 }, { "epoch": 1.5028959456760536, "grad_norm": 2.59375, "learning_rate": 1.7288979037289398e-05, "loss": 0.3068, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15050, "tokens_per_second_per_gpu": 2531.2 }, { "epoch": 1.5038945476333132, "grad_norm": 3.25, "learning_rate": 1.727991822703798e-05, "loss": 0.393, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15060, "tokens_per_second_per_gpu": 2419.24 }, { "epoch": 1.5048931495905733, "grad_norm": 3.734375, "learning_rate": 1.7270854474180992e-05, "loss": 0.3523, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15070, "tokens_per_second_per_gpu": 2188.69 }, { "epoch": 1.505891751547833, "grad_norm": 3.578125, "learning_rate": 1.7261787784298247e-05, "loss": 0.3763, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15080, "tokens_per_second_per_gpu": 2557.32 }, { "epoch": 1.5068903535050928, "grad_norm": 3.265625, "learning_rate": 1.725271816297137e-05, "loss": 0.4171, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15090, "tokens_per_second_per_gpu": 2519.01 }, { "epoch": 1.5078889554623527, "grad_norm": 3.734375, "learning_rate": 1.7243645615783786e-05, "loss": 0.4131, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15100, "tokens_per_second_per_gpu": 2335.78 }, { "epoch": 1.5088875574196126, "grad_norm": 3.4375, "learning_rate": 1.7234570148320718e-05, "loss": 0.4079, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15110, "tokens_per_second_per_gpu": 2540.83 }, { "epoch": 1.5098861593768724, "grad_norm": 3.890625, "learning_rate": 1.72254917661692e-05, "loss": 0.4316, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15120, "tokens_per_second_per_gpu": 2702.46 }, { "epoch": 1.510884761334132, "grad_norm": 3.8125, "learning_rate": 1.7216410474918035e-05, "loss": 0.3961, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15130, "tokens_per_second_per_gpu": 2403.28 }, { "epoch": 1.5118833632913922, "grad_norm": 3.6875, "learning_rate": 1.7207326280157843e-05, "loss": 0.4156, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15140, "tokens_per_second_per_gpu": 2541.55 }, { "epoch": 1.5128819652486518, "grad_norm": 3.921875, "learning_rate": 1.7198239187481023e-05, "loss": 0.4052, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15150, "tokens_per_second_per_gpu": 2336.49 }, { "epoch": 1.5138805672059117, "grad_norm": 2.8125, "learning_rate": 1.7189149202481757e-05, "loss": 0.3502, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15160, "tokens_per_second_per_gpu": 2466.51 }, { "epoch": 1.5148791691631716, "grad_norm": 4.34375, "learning_rate": 1.7180056330756e-05, "loss": 0.4123, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15170, "tokens_per_second_per_gpu": 2307.65 }, { "epoch": 1.5158777711204314, "grad_norm": 3.359375, "learning_rate": 1.7170960577901496e-05, "loss": 0.3497, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15180, "tokens_per_second_per_gpu": 2549.81 }, { "epoch": 1.5168763730776913, "grad_norm": 2.890625, "learning_rate": 1.7161861949517753e-05, "loss": 0.3792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15190, "tokens_per_second_per_gpu": 2513.9 }, { "epoch": 1.517874975034951, "grad_norm": 3.0625, "learning_rate": 1.715276045120606e-05, "loss": 0.3534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15200, "tokens_per_second_per_gpu": 2403.09 }, { "epoch": 1.518873576992211, "grad_norm": 3.078125, "learning_rate": 1.7143656088569464e-05, "loss": 0.3773, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15210, "tokens_per_second_per_gpu": 2409.16 }, { "epoch": 1.5198721789494707, "grad_norm": 3.796875, "learning_rate": 1.7134548867212784e-05, "loss": 0.3718, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15220, "tokens_per_second_per_gpu": 2386.56 }, { "epoch": 1.5208707809067306, "grad_norm": 2.828125, "learning_rate": 1.7125438792742578e-05, "loss": 0.3777, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15230, "tokens_per_second_per_gpu": 2354.55 }, { "epoch": 1.5218693828639904, "grad_norm": 3.03125, "learning_rate": 1.7116325870767192e-05, "loss": 0.4081, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15240, "tokens_per_second_per_gpu": 2435.87 }, { "epoch": 1.5228679848212503, "grad_norm": 3.109375, "learning_rate": 1.710721010689669e-05, "loss": 0.3687, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15250, "tokens_per_second_per_gpu": 2706.45 }, { "epoch": 1.5238665867785102, "grad_norm": 2.6875, "learning_rate": 1.7098091506742926e-05, "loss": 0.3586, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15260, "tokens_per_second_per_gpu": 2560.12 }, { "epoch": 1.5248651887357698, "grad_norm": 2.59375, "learning_rate": 1.7088970075919465e-05, "loss": 0.3695, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15270, "tokens_per_second_per_gpu": 2251.71 }, { "epoch": 1.52586379069303, "grad_norm": 3.203125, "learning_rate": 1.7079845820041628e-05, "loss": 0.4037, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15280, "tokens_per_second_per_gpu": 2287.59 }, { "epoch": 1.5268623926502896, "grad_norm": 2.390625, "learning_rate": 1.7070718744726477e-05, "loss": 0.3505, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15290, "tokens_per_second_per_gpu": 2505.53 }, { "epoch": 1.5278609946075494, "grad_norm": 2.859375, "learning_rate": 1.70615888555928e-05, "loss": 0.3858, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15300, "tokens_per_second_per_gpu": 2492.08 }, { "epoch": 1.5288595965648093, "grad_norm": 2.640625, "learning_rate": 1.705245615826114e-05, "loss": 0.3847, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15310, "tokens_per_second_per_gpu": 2296.69 }, { "epoch": 1.529858198522069, "grad_norm": 3.546875, "learning_rate": 1.7043320658353744e-05, "loss": 0.3619, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15320, "tokens_per_second_per_gpu": 2294.72 }, { "epoch": 1.530856800479329, "grad_norm": 3.125, "learning_rate": 1.7034182361494588e-05, "loss": 0.3842, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15330, "tokens_per_second_per_gpu": 2523.71 }, { "epoch": 1.5318554024365887, "grad_norm": 3.859375, "learning_rate": 1.7025041273309383e-05, "loss": 0.388, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15340, "tokens_per_second_per_gpu": 2586.04 }, { "epoch": 1.5328540043938486, "grad_norm": 3.34375, "learning_rate": 1.7015897399425558e-05, "loss": 0.3892, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15350, "tokens_per_second_per_gpu": 2483.57 }, { "epoch": 1.5338526063511084, "grad_norm": 3.5, "learning_rate": 1.7006750745472234e-05, "loss": 0.3613, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15360, "tokens_per_second_per_gpu": 2394.34 }, { "epoch": 1.5348512083083683, "grad_norm": 4.0, "learning_rate": 1.699760131708027e-05, "loss": 0.4003, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15370, "tokens_per_second_per_gpu": 2340.48 }, { "epoch": 1.5358498102656282, "grad_norm": 4.0625, "learning_rate": 1.6988449119882222e-05, "loss": 0.4202, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15380, "tokens_per_second_per_gpu": 2560.43 }, { "epoch": 1.5368484122228878, "grad_norm": 3.484375, "learning_rate": 1.6979294159512342e-05, "loss": 0.3417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15390, "tokens_per_second_per_gpu": 2428.61 }, { "epoch": 1.537847014180148, "grad_norm": 4.0625, "learning_rate": 1.6970136441606604e-05, "loss": 0.408, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15400, "tokens_per_second_per_gpu": 2555.33 }, { "epoch": 1.5388456161374076, "grad_norm": 3.046875, "learning_rate": 1.6960975971802663e-05, "loss": 0.349, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15410, "tokens_per_second_per_gpu": 2448.83 }, { "epoch": 1.5398442180946674, "grad_norm": 2.828125, "learning_rate": 1.695181275573987e-05, "loss": 0.3828, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15420, "tokens_per_second_per_gpu": 2488.02 }, { "epoch": 1.5408428200519273, "grad_norm": 2.90625, "learning_rate": 1.694264679905927e-05, "loss": 0.3796, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15430, "tokens_per_second_per_gpu": 2451.29 }, { "epoch": 1.5418414220091872, "grad_norm": 3.703125, "learning_rate": 1.69334781074036e-05, "loss": 0.3655, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15440, "tokens_per_second_per_gpu": 2449.66 }, { "epoch": 1.542840023966447, "grad_norm": 4.03125, "learning_rate": 1.6924306686417265e-05, "loss": 0.361, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15450, "tokens_per_second_per_gpu": 2463.82 }, { "epoch": 1.5438386259237067, "grad_norm": 2.578125, "learning_rate": 1.6915132541746367e-05, "loss": 0.4395, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15460, "tokens_per_second_per_gpu": 2592.62 }, { "epoch": 1.5448372278809668, "grad_norm": 3.4375, "learning_rate": 1.690595567903868e-05, "loss": 0.4053, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15470, "tokens_per_second_per_gpu": 2595.8 }, { "epoch": 1.5458358298382264, "grad_norm": 3.328125, "learning_rate": 1.689677610394365e-05, "loss": 0.3858, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15480, "tokens_per_second_per_gpu": 2285.48 }, { "epoch": 1.5468344317954863, "grad_norm": 2.75, "learning_rate": 1.6887593822112382e-05, "loss": 0.377, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15490, "tokens_per_second_per_gpu": 2371.52 }, { "epoch": 1.5478330337527462, "grad_norm": 2.765625, "learning_rate": 1.687840883919767e-05, "loss": 0.3866, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15500, "tokens_per_second_per_gpu": 2541.41 }, { "epoch": 1.548831635710006, "grad_norm": 2.765625, "learning_rate": 1.686922116085395e-05, "loss": 0.376, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15510, "tokens_per_second_per_gpu": 2453.76 }, { "epoch": 1.549830237667266, "grad_norm": 3.203125, "learning_rate": 1.6860030792737332e-05, "loss": 0.3924, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15520, "tokens_per_second_per_gpu": 2531.89 }, { "epoch": 1.5508288396245256, "grad_norm": 2.90625, "learning_rate": 1.685083774050557e-05, "loss": 0.3417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15530, "tokens_per_second_per_gpu": 2579.53 }, { "epoch": 1.5518274415817856, "grad_norm": 3.5625, "learning_rate": 1.684164200981808e-05, "loss": 0.3608, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15540, "tokens_per_second_per_gpu": 2339.43 }, { "epoch": 1.5528260435390453, "grad_norm": 2.5625, "learning_rate": 1.6832443606335924e-05, "loss": 0.387, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15550, "tokens_per_second_per_gpu": 2311.16 }, { "epoch": 1.5538246454963052, "grad_norm": 3.484375, "learning_rate": 1.6823242535721803e-05, "loss": 0.399, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15560, "tokens_per_second_per_gpu": 2565.8 }, { "epoch": 1.554823247453565, "grad_norm": 2.578125, "learning_rate": 1.681403880364007e-05, "loss": 0.4166, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15570, "tokens_per_second_per_gpu": 2440.27 }, { "epoch": 1.5558218494108247, "grad_norm": 3.265625, "learning_rate": 1.6804832415756706e-05, "loss": 0.3627, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15580, "tokens_per_second_per_gpu": 2443.36 }, { "epoch": 1.5568204513680848, "grad_norm": 3.828125, "learning_rate": 1.6795623377739335e-05, "loss": 0.3837, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15590, "tokens_per_second_per_gpu": 2357.12 }, { "epoch": 1.5578190533253444, "grad_norm": 2.609375, "learning_rate": 1.678641169525722e-05, "loss": 0.3512, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15600, "tokens_per_second_per_gpu": 2593.26 }, { "epoch": 1.5588176552826045, "grad_norm": 3.53125, "learning_rate": 1.677719737398123e-05, "loss": 0.3627, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15610, "tokens_per_second_per_gpu": 2327.69 }, { "epoch": 1.5598162572398642, "grad_norm": 4.125, "learning_rate": 1.6767980419583874e-05, "loss": 0.411, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15620, "tokens_per_second_per_gpu": 2452.49 }, { "epoch": 1.560814859197124, "grad_norm": 3.09375, "learning_rate": 1.6758760837739275e-05, "loss": 0.3824, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15630, "tokens_per_second_per_gpu": 2539.93 }, { "epoch": 1.561813461154384, "grad_norm": 3.25, "learning_rate": 1.674953863412319e-05, "loss": 0.3425, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15640, "tokens_per_second_per_gpu": 2477.64 }, { "epoch": 1.5628120631116436, "grad_norm": 3.796875, "learning_rate": 1.674031381441296e-05, "loss": 0.3683, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15650, "tokens_per_second_per_gpu": 2301.23 }, { "epoch": 1.5638106650689036, "grad_norm": 3.796875, "learning_rate": 1.6731086384287564e-05, "loss": 0.4098, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15660, "tokens_per_second_per_gpu": 2536.25 }, { "epoch": 1.5648092670261633, "grad_norm": 3.28125, "learning_rate": 1.6721856349427582e-05, "loss": 0.3826, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15670, "tokens_per_second_per_gpu": 2320.2 }, { "epoch": 1.5658078689834232, "grad_norm": 3.390625, "learning_rate": 1.671262371551518e-05, "loss": 0.4028, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15680, "tokens_per_second_per_gpu": 2408.37 }, { "epoch": 1.566806470940683, "grad_norm": 3.359375, "learning_rate": 1.670338848823415e-05, "loss": 0.3773, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15690, "tokens_per_second_per_gpu": 2363.74 }, { "epoch": 1.567805072897943, "grad_norm": 4.6875, "learning_rate": 1.669415067326986e-05, "loss": 0.3719, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15700, "tokens_per_second_per_gpu": 2523.0 }, { "epoch": 1.5688036748552028, "grad_norm": 2.234375, "learning_rate": 1.668491027630928e-05, "loss": 0.346, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15710, "tokens_per_second_per_gpu": 2477.53 }, { "epoch": 1.5698022768124624, "grad_norm": 3.609375, "learning_rate": 1.6675667303040965e-05, "loss": 0.3545, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15720, "tokens_per_second_per_gpu": 2294.56 }, { "epoch": 1.5708008787697225, "grad_norm": 2.828125, "learning_rate": 1.6666421759155072e-05, "loss": 0.3649, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15730, "tokens_per_second_per_gpu": 2439.22 }, { "epoch": 1.5717994807269822, "grad_norm": 3.21875, "learning_rate": 1.6657173650343315e-05, "loss": 0.3792, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15740, "tokens_per_second_per_gpu": 2629.06 }, { "epoch": 1.572798082684242, "grad_norm": 2.96875, "learning_rate": 1.6647922982299004e-05, "loss": 0.3692, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15750, "tokens_per_second_per_gpu": 2444.12 }, { "epoch": 1.573796684641502, "grad_norm": 2.703125, "learning_rate": 1.663866976071702e-05, "loss": 0.3387, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15760, "tokens_per_second_per_gpu": 2471.75 }, { "epoch": 1.5747952865987618, "grad_norm": 3.71875, "learning_rate": 1.6629413991293815e-05, "loss": 0.4209, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15770, "tokens_per_second_per_gpu": 2525.28 }, { "epoch": 1.5757938885560216, "grad_norm": 2.984375, "learning_rate": 1.6620155679727417e-05, "loss": 0.4158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15780, "tokens_per_second_per_gpu": 2492.31 }, { "epoch": 1.5767924905132813, "grad_norm": 3.328125, "learning_rate": 1.6610894831717406e-05, "loss": 0.369, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15790, "tokens_per_second_per_gpu": 2576.39 }, { "epoch": 1.5777910924705414, "grad_norm": 3.109375, "learning_rate": 1.6601631452964935e-05, "loss": 0.3386, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15800, "tokens_per_second_per_gpu": 2392.57 }, { "epoch": 1.578789694427801, "grad_norm": 2.640625, "learning_rate": 1.6592365549172712e-05, "loss": 0.2971, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15810, "tokens_per_second_per_gpu": 2363.93 }, { "epoch": 1.579788296385061, "grad_norm": 4.78125, "learning_rate": 1.6583097126044986e-05, "loss": 0.4657, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15820, "tokens_per_second_per_gpu": 2339.79 }, { "epoch": 1.5807868983423208, "grad_norm": 3.359375, "learning_rate": 1.657382618928758e-05, "loss": 0.4158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15830, "tokens_per_second_per_gpu": 2494.56 }, { "epoch": 1.5817855002995804, "grad_norm": 4.09375, "learning_rate": 1.656455274460785e-05, "loss": 0.4108, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15840, "tokens_per_second_per_gpu": 2492.41 }, { "epoch": 1.5827841022568405, "grad_norm": 3.359375, "learning_rate": 1.65552767977147e-05, "loss": 0.3768, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15850, "tokens_per_second_per_gpu": 2358.51 }, { "epoch": 1.5837827042141002, "grad_norm": 2.9375, "learning_rate": 1.6545998354318573e-05, "loss": 0.4054, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15860, "tokens_per_second_per_gpu": 2381.98 }, { "epoch": 1.5847813061713603, "grad_norm": 3.59375, "learning_rate": 1.653671742013145e-05, "loss": 0.4184, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15870, "tokens_per_second_per_gpu": 2358.26 }, { "epoch": 1.58577990812862, "grad_norm": 3.21875, "learning_rate": 1.652743400086684e-05, "loss": 0.3683, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15880, "tokens_per_second_per_gpu": 2442.09 }, { "epoch": 1.5867785100858798, "grad_norm": 3.359375, "learning_rate": 1.6518148102239792e-05, "loss": 0.3516, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15890, "tokens_per_second_per_gpu": 2414.76 }, { "epoch": 1.5877771120431396, "grad_norm": 3.671875, "learning_rate": 1.6508859729966877e-05, "loss": 0.3894, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15900, "tokens_per_second_per_gpu": 2349.89 }, { "epoch": 1.5887757140003993, "grad_norm": 4.40625, "learning_rate": 1.6499568889766182e-05, "loss": 0.4086, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15910, "tokens_per_second_per_gpu": 2403.98 }, { "epoch": 1.5897743159576594, "grad_norm": 3.671875, "learning_rate": 1.6490275587357323e-05, "loss": 0.4302, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15920, "tokens_per_second_per_gpu": 2604.09 }, { "epoch": 1.590772917914919, "grad_norm": 3.859375, "learning_rate": 1.648097982846143e-05, "loss": 0.4009, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15930, "tokens_per_second_per_gpu": 2428.34 }, { "epoch": 1.591771519872179, "grad_norm": 3.28125, "learning_rate": 1.6471681618801138e-05, "loss": 0.3566, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15940, "tokens_per_second_per_gpu": 2474.83 }, { "epoch": 1.5927701218294388, "grad_norm": 3.1875, "learning_rate": 1.6462380964100597e-05, "loss": 0.3766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15950, "tokens_per_second_per_gpu": 2553.73 }, { "epoch": 1.5937687237866986, "grad_norm": 4.03125, "learning_rate": 1.6453077870085464e-05, "loss": 0.4132, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15960, "tokens_per_second_per_gpu": 2416.48 }, { "epoch": 1.5947673257439585, "grad_norm": 3.53125, "learning_rate": 1.644377234248289e-05, "loss": 0.3492, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15970, "tokens_per_second_per_gpu": 2423.97 }, { "epoch": 1.5957659277012182, "grad_norm": 3.375, "learning_rate": 1.6434464387021535e-05, "loss": 0.4313, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15980, "tokens_per_second_per_gpu": 2272.91 }, { "epoch": 1.5967645296584783, "grad_norm": 3.859375, "learning_rate": 1.6425154009431547e-05, "loss": 0.4121, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 15990, "tokens_per_second_per_gpu": 2213.04 }, { "epoch": 1.597763131615738, "grad_norm": 2.21875, "learning_rate": 1.6415841215444556e-05, "loss": 0.337, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16000, "tokens_per_second_per_gpu": 2411.7 }, { "epoch": 1.5987617335729978, "grad_norm": 3.703125, "learning_rate": 1.6406526010793704e-05, "loss": 0.3731, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16010, "tokens_per_second_per_gpu": 2485.84 }, { "epoch": 1.5997603355302576, "grad_norm": 3.03125, "learning_rate": 1.639720840121359e-05, "loss": 0.3673, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16020, "tokens_per_second_per_gpu": 2200.41 }, { "epoch": 1.6007589374875175, "grad_norm": 3.984375, "learning_rate": 1.6387888392440312e-05, "loss": 0.3673, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16030, "tokens_per_second_per_gpu": 2526.89 }, { "epoch": 1.6017575394447774, "grad_norm": 3.078125, "learning_rate": 1.637856599021144e-05, "loss": 0.379, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16040, "tokens_per_second_per_gpu": 2396.35 }, { "epoch": 1.602756141402037, "grad_norm": 2.953125, "learning_rate": 1.6369241200266007e-05, "loss": 0.3974, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16050, "tokens_per_second_per_gpu": 2587.53 }, { "epoch": 1.6037547433592971, "grad_norm": 3.109375, "learning_rate": 1.6359914028344535e-05, "loss": 0.371, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16060, "tokens_per_second_per_gpu": 2534.41 }, { "epoch": 1.6047533453165568, "grad_norm": 2.96875, "learning_rate": 1.6350584480189004e-05, "loss": 0.4175, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16070, "tokens_per_second_per_gpu": 2456.17 }, { "epoch": 1.6057519472738166, "grad_norm": 3.046875, "learning_rate": 1.6341252561542848e-05, "loss": 0.378, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16080, "tokens_per_second_per_gpu": 2438.63 }, { "epoch": 1.6067505492310765, "grad_norm": 3.46875, "learning_rate": 1.6331918278150964e-05, "loss": 0.351, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16090, "tokens_per_second_per_gpu": 2432.66 }, { "epoch": 1.6077491511883362, "grad_norm": 4.03125, "learning_rate": 1.6322581635759725e-05, "loss": 0.4051, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16100, "tokens_per_second_per_gpu": 2343.49 }, { "epoch": 1.6087477531455963, "grad_norm": 3.921875, "learning_rate": 1.6313242640116923e-05, "loss": 0.3997, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16110, "tokens_per_second_per_gpu": 2388.94 }, { "epoch": 1.609746355102856, "grad_norm": 5.5625, "learning_rate": 1.6303901296971826e-05, "loss": 0.3979, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16120, "tokens_per_second_per_gpu": 2396.16 }, { "epoch": 1.610744957060116, "grad_norm": 3.28125, "learning_rate": 1.6294557612075136e-05, "loss": 0.421, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16130, "tokens_per_second_per_gpu": 2487.29 }, { "epoch": 1.6117435590173756, "grad_norm": 3.375, "learning_rate": 1.6285211591178986e-05, "loss": 0.3468, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16140, "tokens_per_second_per_gpu": 2490.75 }, { "epoch": 1.6127421609746355, "grad_norm": 3.0625, "learning_rate": 1.627586324003697e-05, "loss": 0.3853, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16150, "tokens_per_second_per_gpu": 2722.54 }, { "epoch": 1.6137407629318954, "grad_norm": 3.671875, "learning_rate": 1.62665125644041e-05, "loss": 0.4085, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16160, "tokens_per_second_per_gpu": 2384.13 }, { "epoch": 1.614739364889155, "grad_norm": 3.390625, "learning_rate": 1.6257159570036828e-05, "loss": 0.4367, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16170, "tokens_per_second_per_gpu": 2349.83 }, { "epoch": 1.6157379668464151, "grad_norm": 2.59375, "learning_rate": 1.624780426269303e-05, "loss": 0.3996, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16180, "tokens_per_second_per_gpu": 2491.19 }, { "epoch": 1.6167365688036748, "grad_norm": 3.421875, "learning_rate": 1.6238446648131996e-05, "loss": 0.3803, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16190, "tokens_per_second_per_gpu": 2273.36 }, { "epoch": 1.6177351707609346, "grad_norm": 2.28125, "learning_rate": 1.622908673211445e-05, "loss": 0.3972, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16200, "tokens_per_second_per_gpu": 2573.93 }, { "epoch": 1.6187337727181945, "grad_norm": 3.578125, "learning_rate": 1.6219724520402537e-05, "loss": 0.3444, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16210, "tokens_per_second_per_gpu": 2334.85 }, { "epoch": 1.6197323746754544, "grad_norm": 3.75, "learning_rate": 1.6210360018759795e-05, "loss": 0.3539, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16220, "tokens_per_second_per_gpu": 2392.04 }, { "epoch": 1.6207309766327143, "grad_norm": 3.609375, "learning_rate": 1.6200993232951188e-05, "loss": 0.4295, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16230, "tokens_per_second_per_gpu": 2191.63 }, { "epoch": 1.621729578589974, "grad_norm": 4.53125, "learning_rate": 1.6191624168743086e-05, "loss": 0.4148, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16240, "tokens_per_second_per_gpu": 2509.97 }, { "epoch": 1.622728180547234, "grad_norm": 2.65625, "learning_rate": 1.618225283190325e-05, "loss": 0.3348, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16250, "tokens_per_second_per_gpu": 2371.44 }, { "epoch": 1.6237267825044936, "grad_norm": 2.8125, "learning_rate": 1.617287922820085e-05, "loss": 0.3699, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16260, "tokens_per_second_per_gpu": 2487.26 }, { "epoch": 1.6247253844617535, "grad_norm": 2.703125, "learning_rate": 1.6163503363406452e-05, "loss": 0.3729, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16270, "tokens_per_second_per_gpu": 2480.94 }, { "epoch": 1.6257239864190134, "grad_norm": 2.78125, "learning_rate": 1.6154125243292e-05, "loss": 0.3931, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16280, "tokens_per_second_per_gpu": 2660.41 }, { "epoch": 1.6267225883762733, "grad_norm": 2.84375, "learning_rate": 1.614474487363085e-05, "loss": 0.3987, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16290, "tokens_per_second_per_gpu": 2618.38 }, { "epoch": 1.6277211903335331, "grad_norm": 4.1875, "learning_rate": 1.6135362260197728e-05, "loss": 0.366, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16300, "tokens_per_second_per_gpu": 2336.9 }, { "epoch": 1.6287197922907928, "grad_norm": 3.609375, "learning_rate": 1.6125977408768737e-05, "loss": 0.381, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16310, "tokens_per_second_per_gpu": 2433.41 }, { "epoch": 1.6297183942480529, "grad_norm": 2.796875, "learning_rate": 1.611659032512137e-05, "loss": 0.3988, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16320, "tokens_per_second_per_gpu": 2321.94 }, { "epoch": 1.6307169962053125, "grad_norm": 3.75, "learning_rate": 1.6107201015034486e-05, "loss": 0.3719, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16330, "tokens_per_second_per_gpu": 2450.81 }, { "epoch": 1.6317155981625724, "grad_norm": 3.28125, "learning_rate": 1.6097809484288314e-05, "loss": 0.3691, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16340, "tokens_per_second_per_gpu": 2517.35 }, { "epoch": 1.6327142001198323, "grad_norm": 2.75, "learning_rate": 1.6088415738664458e-05, "loss": 0.3809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16350, "tokens_per_second_per_gpu": 2578.67 }, { "epoch": 1.633712802077092, "grad_norm": 3.828125, "learning_rate": 1.607901978394588e-05, "loss": 0.3481, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16360, "tokens_per_second_per_gpu": 2320.32 }, { "epoch": 1.634711404034352, "grad_norm": 3.296875, "learning_rate": 1.6069621625916897e-05, "loss": 0.3385, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16370, "tokens_per_second_per_gpu": 2256.75 }, { "epoch": 1.6357100059916116, "grad_norm": 2.890625, "learning_rate": 1.60602212703632e-05, "loss": 0.418, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16380, "tokens_per_second_per_gpu": 2485.82 }, { "epoch": 1.6367086079488717, "grad_norm": 3.34375, "learning_rate": 1.6050818723071808e-05, "loss": 0.3232, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16390, "tokens_per_second_per_gpu": 2303.64 }, { "epoch": 1.6377072099061314, "grad_norm": 2.796875, "learning_rate": 1.6041413989831105e-05, "loss": 0.351, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16400, "tokens_per_second_per_gpu": 2562.14 }, { "epoch": 1.6387058118633913, "grad_norm": 3.703125, "learning_rate": 1.603200707643082e-05, "loss": 0.3725, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16410, "tokens_per_second_per_gpu": 2253.22 }, { "epoch": 1.6397044138206511, "grad_norm": 3.578125, "learning_rate": 1.6022597988662026e-05, "loss": 0.3885, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16420, "tokens_per_second_per_gpu": 2341.45 }, { "epoch": 1.6407030157779108, "grad_norm": 3.671875, "learning_rate": 1.601318673231712e-05, "loss": 0.3723, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16430, "tokens_per_second_per_gpu": 2308.59 }, { "epoch": 1.6417016177351709, "grad_norm": 3.328125, "learning_rate": 1.6003773313189853e-05, "loss": 0.3951, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16440, "tokens_per_second_per_gpu": 2468.38 }, { "epoch": 1.6427002196924305, "grad_norm": 2.765625, "learning_rate": 1.59943577370753e-05, "loss": 0.3699, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16450, "tokens_per_second_per_gpu": 2404.55 }, { "epoch": 1.6436988216496904, "grad_norm": 2.734375, "learning_rate": 1.5984940009769857e-05, "loss": 0.4027, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16460, "tokens_per_second_per_gpu": 2437.55 }, { "epoch": 1.6446974236069503, "grad_norm": 2.984375, "learning_rate": 1.597552013707125e-05, "loss": 0.3921, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16470, "tokens_per_second_per_gpu": 2317.36 }, { "epoch": 1.6456960255642101, "grad_norm": 3.921875, "learning_rate": 1.5966098124778528e-05, "loss": 0.3389, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16480, "tokens_per_second_per_gpu": 2460.72 }, { "epoch": 1.64669462752147, "grad_norm": 3.859375, "learning_rate": 1.5956673978692054e-05, "loss": 0.3789, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16490, "tokens_per_second_per_gpu": 2402.74 }, { "epoch": 1.6476932294787296, "grad_norm": 2.765625, "learning_rate": 1.5947247704613513e-05, "loss": 0.3663, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16500, "tokens_per_second_per_gpu": 2576.99 }, { "epoch": 1.6486918314359897, "grad_norm": 2.78125, "learning_rate": 1.5937819308345885e-05, "loss": 0.3895, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16510, "tokens_per_second_per_gpu": 2552.62 }, { "epoch": 1.6496904333932494, "grad_norm": 3.875, "learning_rate": 1.5928388795693462e-05, "loss": 0.3904, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16520, "tokens_per_second_per_gpu": 2218.15 }, { "epoch": 1.6506890353505093, "grad_norm": 2.90625, "learning_rate": 1.5918956172461852e-05, "loss": 0.3967, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16530, "tokens_per_second_per_gpu": 2341.83 }, { "epoch": 1.6516876373077691, "grad_norm": 3.046875, "learning_rate": 1.5909521444457935e-05, "loss": 0.387, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16540, "tokens_per_second_per_gpu": 2432.0 }, { "epoch": 1.652686239265029, "grad_norm": 4.0, "learning_rate": 1.5900084617489915e-05, "loss": 0.417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16550, "tokens_per_second_per_gpu": 2387.53 }, { "epoch": 1.6536848412222889, "grad_norm": 4.125, "learning_rate": 1.589064569736728e-05, "loss": 0.3809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16560, "tokens_per_second_per_gpu": 2444.95 }, { "epoch": 1.6546834431795485, "grad_norm": 3.359375, "learning_rate": 1.588120468990079e-05, "loss": 0.3531, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16570, "tokens_per_second_per_gpu": 2408.9 }, { "epoch": 1.6556820451368086, "grad_norm": 3.375, "learning_rate": 1.587176160090251e-05, "loss": 0.3421, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16580, "tokens_per_second_per_gpu": 2414.67 }, { "epoch": 1.6566806470940683, "grad_norm": 3.46875, "learning_rate": 1.5862316436185782e-05, "loss": 0.4134, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16590, "tokens_per_second_per_gpu": 2447.99 }, { "epoch": 1.6576792490513281, "grad_norm": 3.640625, "learning_rate": 1.5852869201565212e-05, "loss": 0.3659, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16600, "tokens_per_second_per_gpu": 2281.18 }, { "epoch": 1.658677851008588, "grad_norm": 4.3125, "learning_rate": 1.58434199028567e-05, "loss": 0.4214, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16610, "tokens_per_second_per_gpu": 2543.05 }, { "epoch": 1.6596764529658476, "grad_norm": 3.671875, "learning_rate": 1.5833968545877414e-05, "loss": 0.3908, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16620, "tokens_per_second_per_gpu": 2384.86 }, { "epoch": 1.6606750549231077, "grad_norm": 4.375, "learning_rate": 1.582451513644577e-05, "loss": 0.3862, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16630, "tokens_per_second_per_gpu": 2239.67 }, { "epoch": 1.6616736568803674, "grad_norm": 2.515625, "learning_rate": 1.581505968038147e-05, "loss": 0.3207, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16640, "tokens_per_second_per_gpu": 2575.84 }, { "epoch": 1.6626722588376275, "grad_norm": 2.890625, "learning_rate": 1.5805602183505465e-05, "loss": 0.335, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16650, "tokens_per_second_per_gpu": 2503.88 }, { "epoch": 1.6636708607948871, "grad_norm": 3.421875, "learning_rate": 1.5796142651639957e-05, "loss": 0.3879, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16660, "tokens_per_second_per_gpu": 2404.77 }, { "epoch": 1.664669462752147, "grad_norm": 3.859375, "learning_rate": 1.5786681090608416e-05, "loss": 0.4161, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16670, "tokens_per_second_per_gpu": 2511.88 }, { "epoch": 1.6656680647094069, "grad_norm": 2.546875, "learning_rate": 1.5777217506235548e-05, "loss": 0.3874, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16680, "tokens_per_second_per_gpu": 2527.54 }, { "epoch": 1.6666666666666665, "grad_norm": 3.453125, "learning_rate": 1.5767751904347317e-05, "loss": 0.3629, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16690, "tokens_per_second_per_gpu": 2237.65 }, { "epoch": 1.6676652686239266, "grad_norm": 3.015625, "learning_rate": 1.5758284290770914e-05, "loss": 0.364, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16700, "tokens_per_second_per_gpu": 2209.74 }, { "epoch": 1.6686638705811863, "grad_norm": 2.3125, "learning_rate": 1.5748814671334776e-05, "loss": 0.3857, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16710, "tokens_per_second_per_gpu": 2325.27 }, { "epoch": 1.6696624725384461, "grad_norm": 3.265625, "learning_rate": 1.5739343051868575e-05, "loss": 0.3689, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16720, "tokens_per_second_per_gpu": 2341.77 }, { "epoch": 1.670661074495706, "grad_norm": 3.578125, "learning_rate": 1.5729869438203224e-05, "loss": 0.3742, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16730, "tokens_per_second_per_gpu": 2231.08 }, { "epoch": 1.6716596764529659, "grad_norm": 3.90625, "learning_rate": 1.572039383617084e-05, "loss": 0.4107, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16740, "tokens_per_second_per_gpu": 2485.55 }, { "epoch": 1.6726582784102257, "grad_norm": 3.21875, "learning_rate": 1.5710916251604784e-05, "loss": 0.4331, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16750, "tokens_per_second_per_gpu": 2454.2 }, { "epoch": 1.6736568803674854, "grad_norm": 3.25, "learning_rate": 1.5701436690339638e-05, "loss": 0.452, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16760, "tokens_per_second_per_gpu": 2433.6 }, { "epoch": 1.6746554823247455, "grad_norm": 3.546875, "learning_rate": 1.5691955158211184e-05, "loss": 0.3565, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16770, "tokens_per_second_per_gpu": 2365.04 }, { "epoch": 1.6756540842820051, "grad_norm": 2.984375, "learning_rate": 1.568247166105643e-05, "loss": 0.349, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16780, "tokens_per_second_per_gpu": 2299.09 }, { "epoch": 1.676652686239265, "grad_norm": 3.8125, "learning_rate": 1.5672986204713598e-05, "loss": 0.3882, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16790, "tokens_per_second_per_gpu": 2208.28 }, { "epoch": 1.6776512881965249, "grad_norm": 5.34375, "learning_rate": 1.5663498795022097e-05, "loss": 0.4115, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16800, "tokens_per_second_per_gpu": 2439.72 }, { "epoch": 1.6786498901537847, "grad_norm": 3.390625, "learning_rate": 1.565400943782256e-05, "loss": 0.3846, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16810, "tokens_per_second_per_gpu": 2473.8 }, { "epoch": 1.6796484921110446, "grad_norm": 3.953125, "learning_rate": 1.5644518138956807e-05, "loss": 0.3985, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16820, "tokens_per_second_per_gpu": 2551.87 }, { "epoch": 1.6806470940683043, "grad_norm": 2.890625, "learning_rate": 1.563502490426786e-05, "loss": 0.3768, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16830, "tokens_per_second_per_gpu": 2427.68 }, { "epoch": 1.6816456960255644, "grad_norm": 3.5625, "learning_rate": 1.562552973959992e-05, "loss": 0.4268, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16840, "tokens_per_second_per_gpu": 2554.16 }, { "epoch": 1.682644297982824, "grad_norm": 3.90625, "learning_rate": 1.5616032650798397e-05, "loss": 0.38, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16850, "tokens_per_second_per_gpu": 2473.33 }, { "epoch": 1.6836428999400839, "grad_norm": 3.078125, "learning_rate": 1.5606533643709865e-05, "loss": 0.3558, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16860, "tokens_per_second_per_gpu": 2437.43 }, { "epoch": 1.6846415018973437, "grad_norm": 3.265625, "learning_rate": 1.5597032724182085e-05, "loss": 0.3928, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16870, "tokens_per_second_per_gpu": 2344.73 }, { "epoch": 1.6856401038546034, "grad_norm": 3.34375, "learning_rate": 1.5587529898064008e-05, "loss": 0.387, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16880, "tokens_per_second_per_gpu": 2487.74 }, { "epoch": 1.6866387058118635, "grad_norm": 2.84375, "learning_rate": 1.5578025171205742e-05, "loss": 0.4158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16890, "tokens_per_second_per_gpu": 2609.74 }, { "epoch": 1.6876373077691231, "grad_norm": 2.765625, "learning_rate": 1.556851854945857e-05, "loss": 0.4049, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16900, "tokens_per_second_per_gpu": 2541.82 }, { "epoch": 1.6886359097263832, "grad_norm": 2.9375, "learning_rate": 1.5559010038674947e-05, "loss": 0.3692, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16910, "tokens_per_second_per_gpu": 2474.13 }, { "epoch": 1.6896345116836429, "grad_norm": 3.5625, "learning_rate": 1.5549499644708487e-05, "loss": 0.3821, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16920, "tokens_per_second_per_gpu": 2335.96 }, { "epoch": 1.6906331136409027, "grad_norm": 4.15625, "learning_rate": 1.5539987373413965e-05, "loss": 0.413, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16930, "tokens_per_second_per_gpu": 2518.36 }, { "epoch": 1.6916317155981626, "grad_norm": 3.453125, "learning_rate": 1.55304732306473e-05, "loss": 0.3526, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16940, "tokens_per_second_per_gpu": 2397.02 }, { "epoch": 1.6926303175554223, "grad_norm": 3.375, "learning_rate": 1.5520957222265587e-05, "loss": 0.3766, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16950, "tokens_per_second_per_gpu": 2398.92 }, { "epoch": 1.6936289195126824, "grad_norm": 3.5625, "learning_rate": 1.551143935412705e-05, "loss": 0.3504, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16960, "tokens_per_second_per_gpu": 2427.84 }, { "epoch": 1.694627521469942, "grad_norm": 2.84375, "learning_rate": 1.550191963209106e-05, "loss": 0.4094, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16970, "tokens_per_second_per_gpu": 2456.33 }, { "epoch": 1.6956261234272019, "grad_norm": 3.8125, "learning_rate": 1.549239806201813e-05, "loss": 0.4063, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16980, "tokens_per_second_per_gpu": 2485.78 }, { "epoch": 1.6966247253844617, "grad_norm": 3.65625, "learning_rate": 1.548287464976993e-05, "loss": 0.3787, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 16990, "tokens_per_second_per_gpu": 2438.8 }, { "epoch": 1.6976233273417216, "grad_norm": 3.875, "learning_rate": 1.5473349401209235e-05, "loss": 0.3975, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17000, "tokens_per_second_per_gpu": 2492.42 }, { "epoch": 1.6986219292989815, "grad_norm": 3.34375, "learning_rate": 1.5463822322199965e-05, "loss": 0.346, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17010, "tokens_per_second_per_gpu": 2476.92 }, { "epoch": 1.6996205312562411, "grad_norm": 3.875, "learning_rate": 1.5454293418607165e-05, "loss": 0.4391, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17020, "tokens_per_second_per_gpu": 2477.57 }, { "epoch": 1.7006191332135012, "grad_norm": 3.78125, "learning_rate": 1.5444762696297e-05, "loss": 0.4231, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17030, "tokens_per_second_per_gpu": 2519.85 }, { "epoch": 1.7016177351707609, "grad_norm": 2.609375, "learning_rate": 1.543523016113677e-05, "loss": 0.3371, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17040, "tokens_per_second_per_gpu": 2523.54 }, { "epoch": 1.7026163371280207, "grad_norm": 3.71875, "learning_rate": 1.5425695818994866e-05, "loss": 0.3675, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17050, "tokens_per_second_per_gpu": 2325.39 }, { "epoch": 1.7036149390852806, "grad_norm": 3.34375, "learning_rate": 1.541615967574081e-05, "loss": 0.3579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17060, "tokens_per_second_per_gpu": 2346.2 }, { "epoch": 1.7046135410425405, "grad_norm": 3.140625, "learning_rate": 1.5406621737245226e-05, "loss": 0.3883, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17070, "tokens_per_second_per_gpu": 2495.58 }, { "epoch": 1.7056121429998004, "grad_norm": 2.625, "learning_rate": 1.5397082009379846e-05, "loss": 0.3456, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17080, "tokens_per_second_per_gpu": 2488.74 }, { "epoch": 1.70661074495706, "grad_norm": 3.109375, "learning_rate": 1.53875404980175e-05, "loss": 0.3776, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17090, "tokens_per_second_per_gpu": 2355.56 }, { "epoch": 1.70760934691432, "grad_norm": 3.296875, "learning_rate": 1.5377997209032118e-05, "loss": 0.3787, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17100, "tokens_per_second_per_gpu": 2322.53 }, { "epoch": 1.7086079488715797, "grad_norm": 3.140625, "learning_rate": 1.5368452148298727e-05, "loss": 0.3667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17110, "tokens_per_second_per_gpu": 2528.23 }, { "epoch": 1.7096065508288396, "grad_norm": 3.328125, "learning_rate": 1.5358905321693437e-05, "loss": 0.4305, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17120, "tokens_per_second_per_gpu": 2625.61 }, { "epoch": 1.7106051527860995, "grad_norm": 2.75, "learning_rate": 1.5349356735093456e-05, "loss": 0.3983, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17130, "tokens_per_second_per_gpu": 2477.55 }, { "epoch": 1.7116037547433594, "grad_norm": 2.921875, "learning_rate": 1.533980639437706e-05, "loss": 0.4107, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17140, "tokens_per_second_per_gpu": 2471.1 }, { "epoch": 1.7126023567006192, "grad_norm": 2.84375, "learning_rate": 1.533025430542363e-05, "loss": 0.3888, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17150, "tokens_per_second_per_gpu": 2560.99 }, { "epoch": 1.7136009586578789, "grad_norm": 4.53125, "learning_rate": 1.5320700474113594e-05, "loss": 0.3857, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17160, "tokens_per_second_per_gpu": 2364.04 }, { "epoch": 1.714599560615139, "grad_norm": 3.4375, "learning_rate": 1.531114490632847e-05, "loss": 0.3667, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17170, "tokens_per_second_per_gpu": 2337.04 }, { "epoch": 1.7155981625723986, "grad_norm": 3.640625, "learning_rate": 1.530158760795084e-05, "loss": 0.3554, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17180, "tokens_per_second_per_gpu": 2369.18 }, { "epoch": 1.7165967645296585, "grad_norm": 2.46875, "learning_rate": 1.529202858486436e-05, "loss": 0.3681, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17190, "tokens_per_second_per_gpu": 2572.19 }, { "epoch": 1.7175953664869184, "grad_norm": 3.140625, "learning_rate": 1.528246784295373e-05, "loss": 0.444, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17200, "tokens_per_second_per_gpu": 2465.08 }, { "epoch": 1.718593968444178, "grad_norm": 3.8125, "learning_rate": 1.5272905388104724e-05, "loss": 0.4044, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17210, "tokens_per_second_per_gpu": 2517.62 }, { "epoch": 1.719592570401438, "grad_norm": 3.546875, "learning_rate": 1.5263341226204166e-05, "loss": 0.3342, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17220, "tokens_per_second_per_gpu": 2536.95 }, { "epoch": 1.7205911723586977, "grad_norm": 3.34375, "learning_rate": 1.5253775363139927e-05, "loss": 0.4, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17230, "tokens_per_second_per_gpu": 2238.12 }, { "epoch": 1.7215897743159576, "grad_norm": 3.265625, "learning_rate": 1.5244207804800931e-05, "loss": 0.3735, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17240, "tokens_per_second_per_gpu": 2523.43 }, { "epoch": 1.7225883762732175, "grad_norm": 3.6875, "learning_rate": 1.523463855707714e-05, "loss": 0.3502, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17250, "tokens_per_second_per_gpu": 2440.77 }, { "epoch": 1.7235869782304774, "grad_norm": 4.75, "learning_rate": 1.5225067625859562e-05, "loss": 0.3676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17260, "tokens_per_second_per_gpu": 2511.21 }, { "epoch": 1.7245855801877372, "grad_norm": 2.9375, "learning_rate": 1.5215495017040238e-05, "loss": 0.3521, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17270, "tokens_per_second_per_gpu": 2504.83 }, { "epoch": 1.7255841821449969, "grad_norm": 3.21875, "learning_rate": 1.5205920736512238e-05, "loss": 0.3991, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17280, "tokens_per_second_per_gpu": 2559.19 }, { "epoch": 1.726582784102257, "grad_norm": 4.03125, "learning_rate": 1.519634479016967e-05, "loss": 0.3947, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17290, "tokens_per_second_per_gpu": 2285.31 }, { "epoch": 1.7275813860595166, "grad_norm": 3.609375, "learning_rate": 1.5186767183907658e-05, "loss": 0.3302, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17300, "tokens_per_second_per_gpu": 2446.37 }, { "epoch": 1.7285799880167765, "grad_norm": 4.3125, "learning_rate": 1.5177187923622358e-05, "loss": 0.3661, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17310, "tokens_per_second_per_gpu": 2478.51 }, { "epoch": 1.7295785899740364, "grad_norm": 2.40625, "learning_rate": 1.5167607015210932e-05, "loss": 0.3551, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17320, "tokens_per_second_per_gpu": 2395.42 }, { "epoch": 1.7305771919312962, "grad_norm": 2.890625, "learning_rate": 1.5158024464571575e-05, "loss": 0.3406, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17330, "tokens_per_second_per_gpu": 2562.07 }, { "epoch": 1.731575793888556, "grad_norm": 2.28125, "learning_rate": 1.5148440277603468e-05, "loss": 0.3416, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17340, "tokens_per_second_per_gpu": 2371.24 }, { "epoch": 1.7325743958458157, "grad_norm": 3.734375, "learning_rate": 1.513885446020682e-05, "loss": 0.3918, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17350, "tokens_per_second_per_gpu": 2588.57 }, { "epoch": 1.7335729978030758, "grad_norm": 3.53125, "learning_rate": 1.512926701828283e-05, "loss": 0.3675, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17360, "tokens_per_second_per_gpu": 2232.15 }, { "epoch": 1.7345715997603355, "grad_norm": 3.34375, "learning_rate": 1.5119677957733717e-05, "loss": 0.3741, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17370, "tokens_per_second_per_gpu": 2270.82 }, { "epoch": 1.7355702017175954, "grad_norm": 3.3125, "learning_rate": 1.511008728446267e-05, "loss": 0.3669, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17380, "tokens_per_second_per_gpu": 2260.55 }, { "epoch": 1.7365688036748552, "grad_norm": 3.390625, "learning_rate": 1.510049500437389e-05, "loss": 0.385, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17390, "tokens_per_second_per_gpu": 2572.04 }, { "epoch": 1.737567405632115, "grad_norm": 3.25, "learning_rate": 1.509090112337256e-05, "loss": 0.343, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17400, "tokens_per_second_per_gpu": 2394.61 }, { "epoch": 1.738566007589375, "grad_norm": 2.90625, "learning_rate": 1.5081305647364846e-05, "loss": 0.3916, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17410, "tokens_per_second_per_gpu": 2257.72 }, { "epoch": 1.7395646095466346, "grad_norm": 3.9375, "learning_rate": 1.5071708582257907e-05, "loss": 0.3529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17420, "tokens_per_second_per_gpu": 2346.79 }, { "epoch": 1.7405632115038947, "grad_norm": 5.25, "learning_rate": 1.5062109933959865e-05, "loss": 0.4323, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17430, "tokens_per_second_per_gpu": 2243.06 }, { "epoch": 1.7415618134611544, "grad_norm": 3.84375, "learning_rate": 1.5052509708379829e-05, "loss": 0.4015, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17440, "tokens_per_second_per_gpu": 2499.24 }, { "epoch": 1.7425604154184142, "grad_norm": 4.0625, "learning_rate": 1.5042907911427872e-05, "loss": 0.3521, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17450, "tokens_per_second_per_gpu": 2310.95 }, { "epoch": 1.743559017375674, "grad_norm": 3.15625, "learning_rate": 1.503330454901504e-05, "loss": 0.3548, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17460, "tokens_per_second_per_gpu": 2432.9 }, { "epoch": 1.7445576193329337, "grad_norm": 2.828125, "learning_rate": 1.502369962705334e-05, "loss": 0.3754, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17470, "tokens_per_second_per_gpu": 2388.42 }, { "epoch": 1.7455562212901938, "grad_norm": 3.25, "learning_rate": 1.5014093151455732e-05, "loss": 0.3806, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17480, "tokens_per_second_per_gpu": 2450.92 }, { "epoch": 1.7465548232474535, "grad_norm": 2.75, "learning_rate": 1.5004485128136145e-05, "loss": 0.3987, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17490, "tokens_per_second_per_gpu": 2513.24 }, { "epoch": 1.7475534252047134, "grad_norm": 3.421875, "learning_rate": 1.499487556300945e-05, "loss": 0.3979, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17500, "tokens_per_second_per_gpu": 2247.5 }, { "epoch": 1.7485520271619732, "grad_norm": 3.0, "learning_rate": 1.4985264461991477e-05, "loss": 0.3534, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17510, "tokens_per_second_per_gpu": 2417.18 }, { "epoch": 1.749550629119233, "grad_norm": 3.265625, "learning_rate": 1.4975651830998997e-05, "loss": 0.3744, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17520, "tokens_per_second_per_gpu": 2545.88 }, { "epoch": 1.750549231076493, "grad_norm": 3.234375, "learning_rate": 1.4966037675949719e-05, "loss": 0.3817, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17530, "tokens_per_second_per_gpu": 2455.09 }, { "epoch": 1.7515478330337526, "grad_norm": 3.53125, "learning_rate": 1.4956422002762293e-05, "loss": 0.3545, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17540, "tokens_per_second_per_gpu": 2532.7 }, { "epoch": 1.7525464349910127, "grad_norm": 2.828125, "learning_rate": 1.4946804817356308e-05, "loss": 0.3963, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17550, "tokens_per_second_per_gpu": 2378.35 }, { "epoch": 1.7535450369482724, "grad_norm": 2.9375, "learning_rate": 1.4937186125652274e-05, "loss": 0.366, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17560, "tokens_per_second_per_gpu": 2497.38 }, { "epoch": 1.7545436389055322, "grad_norm": 3.859375, "learning_rate": 1.4927565933571644e-05, "loss": 0.4229, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17570, "tokens_per_second_per_gpu": 2307.21 }, { "epoch": 1.755542240862792, "grad_norm": 3.625, "learning_rate": 1.4917944247036778e-05, "loss": 0.4262, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17580, "tokens_per_second_per_gpu": 2361.39 }, { "epoch": 1.756540842820052, "grad_norm": 3.90625, "learning_rate": 1.4908321071970965e-05, "loss": 0.4157, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17590, "tokens_per_second_per_gpu": 2503.13 }, { "epoch": 1.7575394447773118, "grad_norm": 3.671875, "learning_rate": 1.489869641429841e-05, "loss": 0.4054, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17600, "tokens_per_second_per_gpu": 2399.36 }, { "epoch": 1.7585380467345715, "grad_norm": 2.609375, "learning_rate": 1.4889070279944222e-05, "loss": 0.3832, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17610, "tokens_per_second_per_gpu": 2549.11 }, { "epoch": 1.7595366486918316, "grad_norm": 3.53125, "learning_rate": 1.487944267483444e-05, "loss": 0.3869, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17620, "tokens_per_second_per_gpu": 2497.77 }, { "epoch": 1.7605352506490912, "grad_norm": 3.171875, "learning_rate": 1.4869813604895982e-05, "loss": 0.3706, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17630, "tokens_per_second_per_gpu": 2527.44 }, { "epoch": 1.761533852606351, "grad_norm": 3.875, "learning_rate": 1.4860183076056686e-05, "loss": 0.374, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17640, "tokens_per_second_per_gpu": 2320.04 }, { "epoch": 1.762532454563611, "grad_norm": 2.5625, "learning_rate": 1.4850551094245286e-05, "loss": 0.3598, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17650, "tokens_per_second_per_gpu": 2408.43 }, { "epoch": 1.7635310565208708, "grad_norm": 3.734375, "learning_rate": 1.4840917665391401e-05, "loss": 0.3814, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17660, "tokens_per_second_per_gpu": 2513.46 }, { "epoch": 1.7645296584781307, "grad_norm": 4.09375, "learning_rate": 1.4831282795425546e-05, "loss": 0.4051, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17670, "tokens_per_second_per_gpu": 2432.93 }, { "epoch": 1.7655282604353904, "grad_norm": 3.296875, "learning_rate": 1.482164649027913e-05, "loss": 0.374, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17680, "tokens_per_second_per_gpu": 2384.39 }, { "epoch": 1.7665268623926504, "grad_norm": 3.65625, "learning_rate": 1.4812008755884438e-05, "loss": 0.4091, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17690, "tokens_per_second_per_gpu": 2444.08 }, { "epoch": 1.76752546434991, "grad_norm": 2.984375, "learning_rate": 1.4802369598174634e-05, "loss": 0.3446, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17700, "tokens_per_second_per_gpu": 2530.95 }, { "epoch": 1.76852406630717, "grad_norm": 3.421875, "learning_rate": 1.4792729023083765e-05, "loss": 0.3543, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17710, "tokens_per_second_per_gpu": 2341.05 }, { "epoch": 1.7695226682644298, "grad_norm": 4.03125, "learning_rate": 1.4783087036546744e-05, "loss": 0.3558, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17720, "tokens_per_second_per_gpu": 2275.33 }, { "epoch": 1.7705212702216895, "grad_norm": 4.625, "learning_rate": 1.4773443644499352e-05, "loss": 0.3676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17730, "tokens_per_second_per_gpu": 2361.99 }, { "epoch": 1.7715198721789496, "grad_norm": 3.5, "learning_rate": 1.4763798852878244e-05, "loss": 0.3568, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17740, "tokens_per_second_per_gpu": 2428.39 }, { "epoch": 1.7725184741362092, "grad_norm": 3.203125, "learning_rate": 1.4754152667620927e-05, "loss": 0.3301, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17750, "tokens_per_second_per_gpu": 2330.25 }, { "epoch": 1.773517076093469, "grad_norm": 2.765625, "learning_rate": 1.474450509466577e-05, "loss": 0.3201, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17760, "tokens_per_second_per_gpu": 2475.08 }, { "epoch": 1.774515678050729, "grad_norm": 3.90625, "learning_rate": 1.4734856139952003e-05, "loss": 0.4158, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17770, "tokens_per_second_per_gpu": 2371.54 }, { "epoch": 1.7755142800079888, "grad_norm": 2.9375, "learning_rate": 1.472520580941969e-05, "loss": 0.3417, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17780, "tokens_per_second_per_gpu": 2321.41 }, { "epoch": 1.7765128819652487, "grad_norm": 4.40625, "learning_rate": 1.471555410900976e-05, "loss": 0.4249, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17790, "tokens_per_second_per_gpu": 2539.29 }, { "epoch": 1.7775114839225084, "grad_norm": 2.984375, "learning_rate": 1.4705901044663971e-05, "loss": 0.3861, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17800, "tokens_per_second_per_gpu": 2369.51 }, { "epoch": 1.7785100858797684, "grad_norm": 3.015625, "learning_rate": 1.4696246622324933e-05, "loss": 0.3869, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17810, "tokens_per_second_per_gpu": 2529.2 }, { "epoch": 1.779508687837028, "grad_norm": 3.9375, "learning_rate": 1.4686590847936083e-05, "loss": 0.3983, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17820, "tokens_per_second_per_gpu": 2290.19 }, { "epoch": 1.780507289794288, "grad_norm": 4.4375, "learning_rate": 1.4676933727441694e-05, "loss": 0.4587, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17830, "tokens_per_second_per_gpu": 2452.19 }, { "epoch": 1.7815058917515478, "grad_norm": 4.46875, "learning_rate": 1.4667275266786861e-05, "loss": 0.3588, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17840, "tokens_per_second_per_gpu": 2594.42 }, { "epoch": 1.7825044937088077, "grad_norm": 3.0625, "learning_rate": 1.4657615471917521e-05, "loss": 0.3946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17850, "tokens_per_second_per_gpu": 2480.87 }, { "epoch": 1.7835030956660676, "grad_norm": 3.3125, "learning_rate": 1.4647954348780416e-05, "loss": 0.3946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17860, "tokens_per_second_per_gpu": 2316.45 }, { "epoch": 1.7845016976233272, "grad_norm": 4.59375, "learning_rate": 1.4638291903323108e-05, "loss": 0.3916, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17870, "tokens_per_second_per_gpu": 2470.74 }, { "epoch": 1.7855002995805873, "grad_norm": 3.296875, "learning_rate": 1.462862814149398e-05, "loss": 0.357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17880, "tokens_per_second_per_gpu": 2419.85 }, { "epoch": 1.786498901537847, "grad_norm": 3.046875, "learning_rate": 1.461896306924222e-05, "loss": 0.3069, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17890, "tokens_per_second_per_gpu": 2447.47 }, { "epoch": 1.7874975034951068, "grad_norm": 3.40625, "learning_rate": 1.4609296692517824e-05, "loss": 0.4066, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17900, "tokens_per_second_per_gpu": 2222.38 }, { "epoch": 1.7884961054523667, "grad_norm": 2.71875, "learning_rate": 1.4599629017271594e-05, "loss": 0.3298, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17910, "tokens_per_second_per_gpu": 2479.15 }, { "epoch": 1.7894947074096266, "grad_norm": 3.53125, "learning_rate": 1.4589960049455126e-05, "loss": 0.3131, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17920, "tokens_per_second_per_gpu": 2279.97 }, { "epoch": 1.7904933093668864, "grad_norm": 2.875, "learning_rate": 1.4580289795020816e-05, "loss": 0.3719, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17930, "tokens_per_second_per_gpu": 2490.91 }, { "epoch": 1.791491911324146, "grad_norm": 2.859375, "learning_rate": 1.4570618259921848e-05, "loss": 0.3874, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17940, "tokens_per_second_per_gpu": 2260.57 }, { "epoch": 1.7924905132814062, "grad_norm": 3.90625, "learning_rate": 1.4560945450112198e-05, "loss": 0.34, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17950, "tokens_per_second_per_gpu": 2529.84 }, { "epoch": 1.7934891152386658, "grad_norm": 2.75, "learning_rate": 1.4551271371546623e-05, "loss": 0.349, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17960, "tokens_per_second_per_gpu": 2510.2 }, { "epoch": 1.7944877171959257, "grad_norm": 3.203125, "learning_rate": 1.4541596030180674e-05, "loss": 0.3401, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17970, "tokens_per_second_per_gpu": 2350.56 }, { "epoch": 1.7954863191531856, "grad_norm": 3.953125, "learning_rate": 1.4531919431970658e-05, "loss": 0.3871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17980, "tokens_per_second_per_gpu": 2462.27 }, { "epoch": 1.7964849211104452, "grad_norm": 3.5625, "learning_rate": 1.452224158287367e-05, "loss": 0.3622, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 17990, "tokens_per_second_per_gpu": 2383.49 }, { "epoch": 1.7974835230677053, "grad_norm": 3.25, "learning_rate": 1.4512562488847576e-05, "loss": 0.3328, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18000, "tokens_per_second_per_gpu": 2332.59 }, { "epoch": 1.798482125024965, "grad_norm": 3.15625, "learning_rate": 1.4502882155850995e-05, "loss": 0.3452, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18010, "tokens_per_second_per_gpu": 2597.9 }, { "epoch": 1.7994807269822248, "grad_norm": 3.8125, "learning_rate": 1.4493200589843326e-05, "loss": 0.342, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18020, "tokens_per_second_per_gpu": 2358.19 }, { "epoch": 1.8004793289394847, "grad_norm": 3.453125, "learning_rate": 1.4483517796784718e-05, "loss": 0.3729, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18030, "tokens_per_second_per_gpu": 2367.87 }, { "epoch": 1.8014779308967446, "grad_norm": 3.96875, "learning_rate": 1.4473833782636075e-05, "loss": 0.3753, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18040, "tokens_per_second_per_gpu": 2429.64 }, { "epoch": 1.8024765328540044, "grad_norm": 3.171875, "learning_rate": 1.4464148553359056e-05, "loss": 0.3823, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18050, "tokens_per_second_per_gpu": 2368.88 }, { "epoch": 1.803475134811264, "grad_norm": 3.53125, "learning_rate": 1.4454462114916065e-05, "loss": 0.3652, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18060, "tokens_per_second_per_gpu": 2438.33 }, { "epoch": 1.8044737367685242, "grad_norm": 3.53125, "learning_rate": 1.4444774473270249e-05, "loss": 0.4193, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18070, "tokens_per_second_per_gpu": 2498.69 }, { "epoch": 1.8054723387257838, "grad_norm": 3.5, "learning_rate": 1.4435085634385503e-05, "loss": 0.3654, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18080, "tokens_per_second_per_gpu": 2511.51 }, { "epoch": 1.8064709406830437, "grad_norm": 3.8125, "learning_rate": 1.4425395604226449e-05, "loss": 0.3785, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18090, "tokens_per_second_per_gpu": 2523.75 }, { "epoch": 1.8074695426403036, "grad_norm": 3.015625, "learning_rate": 1.4415704388758452e-05, "loss": 0.3805, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18100, "tokens_per_second_per_gpu": 2338.77 }, { "epoch": 1.8084681445975634, "grad_norm": 4.0, "learning_rate": 1.4406011993947608e-05, "loss": 0.4201, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18110, "tokens_per_second_per_gpu": 2290.16 }, { "epoch": 1.8094667465548233, "grad_norm": 3.59375, "learning_rate": 1.4396318425760719e-05, "loss": 0.3803, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18120, "tokens_per_second_per_gpu": 2329.67 }, { "epoch": 1.810465348512083, "grad_norm": 3.4375, "learning_rate": 1.4386623690165332e-05, "loss": 0.4021, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18130, "tokens_per_second_per_gpu": 2367.53 }, { "epoch": 1.811463950469343, "grad_norm": 3.0, "learning_rate": 1.437692779312971e-05, "loss": 0.3809, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18140, "tokens_per_second_per_gpu": 2415.91 }, { "epoch": 1.8124625524266027, "grad_norm": 4.21875, "learning_rate": 1.4367230740622814e-05, "loss": 0.3717, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18150, "tokens_per_second_per_gpu": 2358.52 }, { "epoch": 1.8134611543838626, "grad_norm": 4.09375, "learning_rate": 1.4357532538614335e-05, "loss": 0.3621, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18160, "tokens_per_second_per_gpu": 2365.65 }, { "epoch": 1.8144597563411224, "grad_norm": 3.59375, "learning_rate": 1.4347833193074667e-05, "loss": 0.403, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18170, "tokens_per_second_per_gpu": 2396.34 }, { "epoch": 1.8154583582983823, "grad_norm": 2.953125, "learning_rate": 1.4338132709974902e-05, "loss": 0.3849, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18180, "tokens_per_second_per_gpu": 2132.46 }, { "epoch": 1.8164569602556422, "grad_norm": 3.265625, "learning_rate": 1.4328431095286832e-05, "loss": 0.3841, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18190, "tokens_per_second_per_gpu": 2537.36 }, { "epoch": 1.8174555622129018, "grad_norm": 2.671875, "learning_rate": 1.4318728354982955e-05, "loss": 0.3245, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18200, "tokens_per_second_per_gpu": 2350.41 }, { "epoch": 1.818454164170162, "grad_norm": 3.796875, "learning_rate": 1.4309024495036451e-05, "loss": 0.3875, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18210, "tokens_per_second_per_gpu": 2326.13 }, { "epoch": 1.8194527661274216, "grad_norm": 3.640625, "learning_rate": 1.4299319521421201e-05, "loss": 0.4727, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18220, "tokens_per_second_per_gpu": 2320.63 }, { "epoch": 1.8204513680846814, "grad_norm": 3.953125, "learning_rate": 1.4289613440111765e-05, "loss": 0.3595, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18230, "tokens_per_second_per_gpu": 2557.69 }, { "epoch": 1.8214499700419413, "grad_norm": 2.96875, "learning_rate": 1.4279906257083381e-05, "loss": 0.3662, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18240, "tokens_per_second_per_gpu": 2515.88 }, { "epoch": 1.822448571999201, "grad_norm": 2.96875, "learning_rate": 1.4270197978311966e-05, "loss": 0.332, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18250, "tokens_per_second_per_gpu": 2506.51 }, { "epoch": 1.823447173956461, "grad_norm": 3.03125, "learning_rate": 1.4260488609774125e-05, "loss": 0.4097, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18260, "tokens_per_second_per_gpu": 2447.56 }, { "epoch": 1.8244457759137207, "grad_norm": 2.78125, "learning_rate": 1.4250778157447117e-05, "loss": 0.3151, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18270, "tokens_per_second_per_gpu": 2527.94 }, { "epoch": 1.8254443778709806, "grad_norm": 3.703125, "learning_rate": 1.4241066627308874e-05, "loss": 0.382, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18280, "tokens_per_second_per_gpu": 2522.73 }, { "epoch": 1.8264429798282404, "grad_norm": 3.359375, "learning_rate": 1.4231354025338001e-05, "loss": 0.3839, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18290, "tokens_per_second_per_gpu": 2499.12 }, { "epoch": 1.8274415817855003, "grad_norm": 3.359375, "learning_rate": 1.4221640357513746e-05, "loss": 0.3357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18300, "tokens_per_second_per_gpu": 2440.05 }, { "epoch": 1.8284401837427602, "grad_norm": 3.921875, "learning_rate": 1.4211925629816026e-05, "loss": 0.3706, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18310, "tokens_per_second_per_gpu": 2342.95 }, { "epoch": 1.8294387857000198, "grad_norm": 2.28125, "learning_rate": 1.4202209848225397e-05, "loss": 0.3789, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18320, "tokens_per_second_per_gpu": 2541.65 }, { "epoch": 1.83043738765728, "grad_norm": 4.3125, "learning_rate": 1.4192493018723085e-05, "loss": 0.3933, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18330, "tokens_per_second_per_gpu": 2246.95 }, { "epoch": 1.8314359896145396, "grad_norm": 2.828125, "learning_rate": 1.4182775147290945e-05, "loss": 0.4187, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18340, "tokens_per_second_per_gpu": 2467.48 }, { "epoch": 1.8324345915717994, "grad_norm": 3.046875, "learning_rate": 1.4173056239911475e-05, "loss": 0.3877, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18350, "tokens_per_second_per_gpu": 2403.09 }, { "epoch": 1.8334331935290593, "grad_norm": 2.765625, "learning_rate": 1.4163336302567815e-05, "loss": 0.3733, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18360, "tokens_per_second_per_gpu": 2448.16 }, { "epoch": 1.8344317954863192, "grad_norm": 3.53125, "learning_rate": 1.4153615341243739e-05, "loss": 0.3858, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18370, "tokens_per_second_per_gpu": 2405.79 }, { "epoch": 1.835430397443579, "grad_norm": 3.796875, "learning_rate": 1.4143893361923644e-05, "loss": 0.3944, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18380, "tokens_per_second_per_gpu": 2325.81 }, { "epoch": 1.8364289994008387, "grad_norm": 3.71875, "learning_rate": 1.4134170370592561e-05, "loss": 0.376, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18390, "tokens_per_second_per_gpu": 2470.26 }, { "epoch": 1.8374276013580988, "grad_norm": 5.3125, "learning_rate": 1.4124446373236146e-05, "loss": 0.3105, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18400, "tokens_per_second_per_gpu": 2416.3 }, { "epoch": 1.8384262033153584, "grad_norm": 4.40625, "learning_rate": 1.4114721375840667e-05, "loss": 0.3843, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18410, "tokens_per_second_per_gpu": 2427.52 }, { "epoch": 1.8394248052726183, "grad_norm": 2.8125, "learning_rate": 1.4104995384393015e-05, "loss": 0.4416, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18420, "tokens_per_second_per_gpu": 2430.74 }, { "epoch": 1.8404234072298782, "grad_norm": 3.8125, "learning_rate": 1.4095268404880688e-05, "loss": 0.3419, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18430, "tokens_per_second_per_gpu": 2346.33 }, { "epoch": 1.841422009187138, "grad_norm": 3.90625, "learning_rate": 1.408554044329179e-05, "loss": 0.3963, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18440, "tokens_per_second_per_gpu": 2381.47 }, { "epoch": 1.842420611144398, "grad_norm": 3.5625, "learning_rate": 1.4075811505615034e-05, "loss": 0.3895, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18450, "tokens_per_second_per_gpu": 2405.01 }, { "epoch": 1.8434192131016576, "grad_norm": 3.96875, "learning_rate": 1.4066081597839736e-05, "loss": 0.3865, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18460, "tokens_per_second_per_gpu": 2378.02 }, { "epoch": 1.8444178150589177, "grad_norm": 3.421875, "learning_rate": 1.4056350725955806e-05, "loss": 0.3532, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18470, "tokens_per_second_per_gpu": 2323.24 }, { "epoch": 1.8454164170161773, "grad_norm": 2.625, "learning_rate": 1.4046618895953741e-05, "loss": 0.4008, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18480, "tokens_per_second_per_gpu": 2335.23 }, { "epoch": 1.8464150189734372, "grad_norm": 2.796875, "learning_rate": 1.4036886113824646e-05, "loss": 0.3998, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18490, "tokens_per_second_per_gpu": 2477.95 }, { "epoch": 1.847413620930697, "grad_norm": 2.90625, "learning_rate": 1.4027152385560194e-05, "loss": 0.3606, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18500, "tokens_per_second_per_gpu": 2326.16 }, { "epoch": 1.8484122228879567, "grad_norm": 3.34375, "learning_rate": 1.4017417717152643e-05, "loss": 0.3799, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18510, "tokens_per_second_per_gpu": 2594.54 }, { "epoch": 1.8494108248452168, "grad_norm": 3.921875, "learning_rate": 1.4007682114594842e-05, "loss": 0.3457, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18520, "tokens_per_second_per_gpu": 2381.53 }, { "epoch": 1.8504094268024764, "grad_norm": 4.03125, "learning_rate": 1.39979455838802e-05, "loss": 0.4058, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18530, "tokens_per_second_per_gpu": 2414.51 }, { "epoch": 1.8514080287597365, "grad_norm": 4.25, "learning_rate": 1.3988208131002715e-05, "loss": 0.4287, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18540, "tokens_per_second_per_gpu": 2376.19 }, { "epoch": 1.8524066307169962, "grad_norm": 3.84375, "learning_rate": 1.3978469761956928e-05, "loss": 0.4282, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18550, "tokens_per_second_per_gpu": 2610.1 }, { "epoch": 1.853405232674256, "grad_norm": 4.03125, "learning_rate": 1.3968730482737969e-05, "loss": 0.4451, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18560, "tokens_per_second_per_gpu": 2470.82 }, { "epoch": 1.854403834631516, "grad_norm": 5.03125, "learning_rate": 1.3958990299341512e-05, "loss": 0.4628, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18570, "tokens_per_second_per_gpu": 2370.79 }, { "epoch": 1.8554024365887756, "grad_norm": 4.125, "learning_rate": 1.3949249217763791e-05, "loss": 0.4089, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18580, "tokens_per_second_per_gpu": 2400.06 }, { "epoch": 1.8564010385460357, "grad_norm": 2.8125, "learning_rate": 1.3939507244001601e-05, "loss": 0.3699, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18590, "tokens_per_second_per_gpu": 2596.1 }, { "epoch": 1.8573996405032953, "grad_norm": 2.984375, "learning_rate": 1.392976438405228e-05, "loss": 0.3453, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18600, "tokens_per_second_per_gpu": 2501.55 }, { "epoch": 1.8583982424605552, "grad_norm": 3.84375, "learning_rate": 1.3920020643913706e-05, "loss": 0.3694, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18610, "tokens_per_second_per_gpu": 2566.21 }, { "epoch": 1.859396844417815, "grad_norm": 3.3125, "learning_rate": 1.391027602958431e-05, "loss": 0.4045, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18620, "tokens_per_second_per_gpu": 2290.04 }, { "epoch": 1.860395446375075, "grad_norm": 3.015625, "learning_rate": 1.3900530547063053e-05, "loss": 0.3742, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18630, "tokens_per_second_per_gpu": 2438.26 }, { "epoch": 1.8613940483323348, "grad_norm": 3.1875, "learning_rate": 1.3890784202349433e-05, "loss": 0.347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18640, "tokens_per_second_per_gpu": 2451.16 }, { "epoch": 1.8623926502895944, "grad_norm": 3.28125, "learning_rate": 1.388103700144348e-05, "loss": 0.3475, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18650, "tokens_per_second_per_gpu": 2530.67 }, { "epoch": 1.8633912522468545, "grad_norm": 3.28125, "learning_rate": 1.3871288950345752e-05, "loss": 0.3698, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18660, "tokens_per_second_per_gpu": 2373.33 }, { "epoch": 1.8643898542041142, "grad_norm": 3.28125, "learning_rate": 1.3861540055057327e-05, "loss": 0.3856, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18670, "tokens_per_second_per_gpu": 2185.21 }, { "epoch": 1.865388456161374, "grad_norm": 2.640625, "learning_rate": 1.3851790321579802e-05, "loss": 0.3471, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18680, "tokens_per_second_per_gpu": 2376.95 }, { "epoch": 1.866387058118634, "grad_norm": 3.71875, "learning_rate": 1.3842039755915298e-05, "loss": 0.3425, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18690, "tokens_per_second_per_gpu": 2400.46 }, { "epoch": 1.8673856600758938, "grad_norm": 3.046875, "learning_rate": 1.3832288364066435e-05, "loss": 0.3547, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18700, "tokens_per_second_per_gpu": 2309.22 }, { "epoch": 1.8683842620331537, "grad_norm": 3.171875, "learning_rate": 1.382253615203635e-05, "loss": 0.3832, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18710, "tokens_per_second_per_gpu": 2387.77 }, { "epoch": 1.8693828639904133, "grad_norm": 3.0, "learning_rate": 1.3812783125828694e-05, "loss": 0.3339, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18720, "tokens_per_second_per_gpu": 2433.82 }, { "epoch": 1.8703814659476734, "grad_norm": 3.125, "learning_rate": 1.3803029291447597e-05, "loss": 0.3468, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18730, "tokens_per_second_per_gpu": 2242.52 }, { "epoch": 1.871380067904933, "grad_norm": 4.1875, "learning_rate": 1.3793274654897709e-05, "loss": 0.4161, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18740, "tokens_per_second_per_gpu": 2291.83 }, { "epoch": 1.872378669862193, "grad_norm": 3.515625, "learning_rate": 1.3783519222184157e-05, "loss": 0.3695, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18750, "tokens_per_second_per_gpu": 2328.45 }, { "epoch": 1.8733772718194528, "grad_norm": 3.671875, "learning_rate": 1.3773762999312564e-05, "loss": 0.4081, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18760, "tokens_per_second_per_gpu": 2417.5 }, { "epoch": 1.8743758737767124, "grad_norm": 2.796875, "learning_rate": 1.3764005992289048e-05, "loss": 0.4027, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18770, "tokens_per_second_per_gpu": 2512.9 }, { "epoch": 1.8753744757339725, "grad_norm": 3.015625, "learning_rate": 1.375424820712019e-05, "loss": 0.3513, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18780, "tokens_per_second_per_gpu": 2450.84 }, { "epoch": 1.8763730776912322, "grad_norm": 3.21875, "learning_rate": 1.3744489649813072e-05, "loss": 0.3814, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18790, "tokens_per_second_per_gpu": 2294.03 }, { "epoch": 1.8773716796484923, "grad_norm": 3.171875, "learning_rate": 1.373473032637524e-05, "loss": 0.3453, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18800, "tokens_per_second_per_gpu": 2369.03 }, { "epoch": 1.878370281605752, "grad_norm": 3.421875, "learning_rate": 1.3724970242814708e-05, "loss": 0.3339, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18810, "tokens_per_second_per_gpu": 2390.22 }, { "epoch": 1.8793688835630118, "grad_norm": 3.96875, "learning_rate": 1.3715209405139967e-05, "loss": 0.3837, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18820, "tokens_per_second_per_gpu": 2413.09 }, { "epoch": 1.8803674855202717, "grad_norm": 3.265625, "learning_rate": 1.370544781935997e-05, "loss": 0.3868, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18830, "tokens_per_second_per_gpu": 1625.31 }, { "epoch": 1.8813660874775313, "grad_norm": 3.5, "learning_rate": 1.3695685491484123e-05, "loss": 0.3723, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18840, "tokens_per_second_per_gpu": 2586.9 }, { "epoch": 1.8823646894347914, "grad_norm": 4.625, "learning_rate": 1.3685922427522296e-05, "loss": 0.3413, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18850, "tokens_per_second_per_gpu": 2267.55 }, { "epoch": 1.883363291392051, "grad_norm": 2.8125, "learning_rate": 1.3676158633484818e-05, "loss": 0.3624, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18860, "tokens_per_second_per_gpu": 2389.15 }, { "epoch": 1.884361893349311, "grad_norm": 2.9375, "learning_rate": 1.3666394115382447e-05, "loss": 0.3596, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18870, "tokens_per_second_per_gpu": 2526.62 }, { "epoch": 1.8853604953065708, "grad_norm": 3.359375, "learning_rate": 1.3656628879226414e-05, "loss": 0.3614, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18880, "tokens_per_second_per_gpu": 2243.91 }, { "epoch": 1.8863590972638307, "grad_norm": 3.046875, "learning_rate": 1.364686293102837e-05, "loss": 0.3308, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18890, "tokens_per_second_per_gpu": 2574.83 }, { "epoch": 1.8873576992210905, "grad_norm": 3.015625, "learning_rate": 1.3637096276800412e-05, "loss": 0.3971, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18900, "tokens_per_second_per_gpu": 2406.56 }, { "epoch": 1.8883563011783502, "grad_norm": 3.328125, "learning_rate": 1.3627328922555071e-05, "loss": 0.3616, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18910, "tokens_per_second_per_gpu": 2344.5 }, { "epoch": 1.8893549031356103, "grad_norm": 3.578125, "learning_rate": 1.3617560874305318e-05, "loss": 0.3575, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18920, "tokens_per_second_per_gpu": 2432.67 }, { "epoch": 1.89035350509287, "grad_norm": 4.0, "learning_rate": 1.3607792138064534e-05, "loss": 0.3932, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18930, "tokens_per_second_per_gpu": 2302.79 }, { "epoch": 1.8913521070501298, "grad_norm": 3.265625, "learning_rate": 1.359802271984654e-05, "loss": 0.3864, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18940, "tokens_per_second_per_gpu": 2463.36 }, { "epoch": 1.8923507090073897, "grad_norm": 3.125, "learning_rate": 1.3588252625665566e-05, "loss": 0.3647, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18950, "tokens_per_second_per_gpu": 2320.07 }, { "epoch": 1.8933493109646495, "grad_norm": 3.34375, "learning_rate": 1.3578481861536257e-05, "loss": 0.4094, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18960, "tokens_per_second_per_gpu": 2423.33 }, { "epoch": 1.8943479129219094, "grad_norm": 2.71875, "learning_rate": 1.356871043347368e-05, "loss": 0.3978, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18970, "tokens_per_second_per_gpu": 2531.66 }, { "epoch": 1.895346514879169, "grad_norm": 4.21875, "learning_rate": 1.3558938347493307e-05, "loss": 0.3547, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18980, "tokens_per_second_per_gpu": 2391.98 }, { "epoch": 1.8963451168364291, "grad_norm": 2.578125, "learning_rate": 1.3549165609611015e-05, "loss": 0.4097, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 18990, "tokens_per_second_per_gpu": 2495.39 }, { "epoch": 1.8973437187936888, "grad_norm": 4.0625, "learning_rate": 1.3539392225843076e-05, "loss": 0.4157, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19000, "tokens_per_second_per_gpu": 2498.32 }, { "epoch": 1.8983423207509487, "grad_norm": 3.25, "learning_rate": 1.3529618202206168e-05, "loss": 0.3746, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19010, "tokens_per_second_per_gpu": 2223.12 }, { "epoch": 1.8993409227082085, "grad_norm": 2.78125, "learning_rate": 1.3519843544717359e-05, "loss": 0.4007, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19020, "tokens_per_second_per_gpu": 2313.29 }, { "epoch": 1.9003395246654682, "grad_norm": 2.78125, "learning_rate": 1.3510068259394106e-05, "loss": 0.3333, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19030, "tokens_per_second_per_gpu": 2258.49 }, { "epoch": 1.9013381266227283, "grad_norm": 2.796875, "learning_rate": 1.3500292352254257e-05, "loss": 0.3844, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19040, "tokens_per_second_per_gpu": 2426.18 }, { "epoch": 1.902336728579988, "grad_norm": 3.09375, "learning_rate": 1.3490515829316039e-05, "loss": 0.3405, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19050, "tokens_per_second_per_gpu": 2349.57 }, { "epoch": 1.903335330537248, "grad_norm": 4.0, "learning_rate": 1.3480738696598063e-05, "loss": 0.4286, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19060, "tokens_per_second_per_gpu": 2550.55 }, { "epoch": 1.9043339324945077, "grad_norm": 2.96875, "learning_rate": 1.3470960960119309e-05, "loss": 0.3374, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19070, "tokens_per_second_per_gpu": 2378.4 }, { "epoch": 1.9053325344517675, "grad_norm": 3.171875, "learning_rate": 1.3461182625899126e-05, "loss": 0.3646, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19080, "tokens_per_second_per_gpu": 2422.24 }, { "epoch": 1.9063311364090274, "grad_norm": 3.375, "learning_rate": 1.3451403699957246e-05, "loss": 0.3786, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19090, "tokens_per_second_per_gpu": 2610.91 }, { "epoch": 1.907329738366287, "grad_norm": 3.359375, "learning_rate": 1.3441624188313748e-05, "loss": 0.3393, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19100, "tokens_per_second_per_gpu": 2341.12 }, { "epoch": 1.9083283403235471, "grad_norm": 3.015625, "learning_rate": 1.3431844096989082e-05, "loss": 0.3674, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19110, "tokens_per_second_per_gpu": 2242.32 }, { "epoch": 1.9093269422808068, "grad_norm": 3.25, "learning_rate": 1.3422063432004056e-05, "loss": 0.3236, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19120, "tokens_per_second_per_gpu": 2483.45 }, { "epoch": 1.9103255442380667, "grad_norm": 2.890625, "learning_rate": 1.3412282199379819e-05, "loss": 0.3704, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19130, "tokens_per_second_per_gpu": 2394.48 }, { "epoch": 1.9113241461953265, "grad_norm": 3.296875, "learning_rate": 1.3402500405137886e-05, "loss": 0.4118, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19140, "tokens_per_second_per_gpu": 2317.7 }, { "epoch": 1.9123227481525864, "grad_norm": 2.828125, "learning_rate": 1.3392718055300105e-05, "loss": 0.321, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19150, "tokens_per_second_per_gpu": 2328.8 }, { "epoch": 1.9133213501098463, "grad_norm": 3.265625, "learning_rate": 1.3382935155888668e-05, "loss": 0.3528, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19160, "tokens_per_second_per_gpu": 2497.69 }, { "epoch": 1.914319952067106, "grad_norm": 3.78125, "learning_rate": 1.3373151712926112e-05, "loss": 0.4085, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19170, "tokens_per_second_per_gpu": 2402.64 }, { "epoch": 1.915318554024366, "grad_norm": 4.53125, "learning_rate": 1.3363367732435306e-05, "loss": 0.446, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19180, "tokens_per_second_per_gpu": 2463.06 }, { "epoch": 1.9163171559816257, "grad_norm": 2.5, "learning_rate": 1.3353583220439444e-05, "loss": 0.3833, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19190, "tokens_per_second_per_gpu": 2384.65 }, { "epoch": 1.9173157579388855, "grad_norm": 3.140625, "learning_rate": 1.3343798182962056e-05, "loss": 0.3986, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19200, "tokens_per_second_per_gpu": 2360.93 }, { "epoch": 1.9183143598961454, "grad_norm": 3.09375, "learning_rate": 1.3334012626026987e-05, "loss": 0.3504, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19210, "tokens_per_second_per_gpu": 2487.67 }, { "epoch": 1.9193129618534053, "grad_norm": 3.671875, "learning_rate": 1.3324226555658404e-05, "loss": 0.3562, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19220, "tokens_per_second_per_gpu": 2467.01 }, { "epoch": 1.9203115638106651, "grad_norm": 3.09375, "learning_rate": 1.3314439977880799e-05, "loss": 0.3757, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19230, "tokens_per_second_per_gpu": 2485.53 }, { "epoch": 1.9213101657679248, "grad_norm": 3.078125, "learning_rate": 1.330465289871896e-05, "loss": 0.3472, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19240, "tokens_per_second_per_gpu": 2420.19 }, { "epoch": 1.9223087677251849, "grad_norm": 3.90625, "learning_rate": 1.3294865324198e-05, "loss": 0.3867, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19250, "tokens_per_second_per_gpu": 2298.93 }, { "epoch": 1.9233073696824445, "grad_norm": 4.96875, "learning_rate": 1.3285077260343331e-05, "loss": 0.4261, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19260, "tokens_per_second_per_gpu": 2265.8 }, { "epoch": 1.9243059716397044, "grad_norm": 2.265625, "learning_rate": 1.3275288713180661e-05, "loss": 0.3909, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19270, "tokens_per_second_per_gpu": 2526.39 }, { "epoch": 1.9253045735969643, "grad_norm": 3.296875, "learning_rate": 1.3265499688735999e-05, "loss": 0.3838, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19280, "tokens_per_second_per_gpu": 2363.31 }, { "epoch": 1.926303175554224, "grad_norm": 2.953125, "learning_rate": 1.3255710193035652e-05, "loss": 0.3529, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19290, "tokens_per_second_per_gpu": 2483.03 }, { "epoch": 1.927301777511484, "grad_norm": 2.65625, "learning_rate": 1.3245920232106212e-05, "loss": 0.4188, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19300, "tokens_per_second_per_gpu": 2541.76 }, { "epoch": 1.9283003794687437, "grad_norm": 3.828125, "learning_rate": 1.3236129811974556e-05, "loss": 0.4059, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19310, "tokens_per_second_per_gpu": 2557.79 }, { "epoch": 1.9292989814260038, "grad_norm": 3.0625, "learning_rate": 1.3226338938667854e-05, "loss": 0.3831, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19320, "tokens_per_second_per_gpu": 2360.33 }, { "epoch": 1.9302975833832634, "grad_norm": 3.65625, "learning_rate": 1.3216547618213542e-05, "loss": 0.4051, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19330, "tokens_per_second_per_gpu": 2374.15 }, { "epoch": 1.9312961853405233, "grad_norm": 3.765625, "learning_rate": 1.3206755856639338e-05, "loss": 0.357, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19340, "tokens_per_second_per_gpu": 2390.41 }, { "epoch": 1.9322947872977831, "grad_norm": 2.6875, "learning_rate": 1.319696365997323e-05, "loss": 0.3808, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19350, "tokens_per_second_per_gpu": 2437.58 }, { "epoch": 1.9332933892550428, "grad_norm": 3.40625, "learning_rate": 1.3187171034243476e-05, "loss": 0.437, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19360, "tokens_per_second_per_gpu": 2467.65 }, { "epoch": 1.9342919912123029, "grad_norm": 3.359375, "learning_rate": 1.3177377985478593e-05, "loss": 0.4186, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19370, "tokens_per_second_per_gpu": 2390.01 }, { "epoch": 1.9352905931695625, "grad_norm": 3.71875, "learning_rate": 1.3167584519707365e-05, "loss": 0.3701, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19380, "tokens_per_second_per_gpu": 2392.6 }, { "epoch": 1.9362891951268224, "grad_norm": 2.9375, "learning_rate": 1.3157790642958833e-05, "loss": 0.3799, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19390, "tokens_per_second_per_gpu": 2301.39 }, { "epoch": 1.9372877970840823, "grad_norm": 4.15625, "learning_rate": 1.314799636126228e-05, "loss": 0.4096, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19400, "tokens_per_second_per_gpu": 2433.55 }, { "epoch": 1.9382863990413421, "grad_norm": 3.375, "learning_rate": 1.3138201680647252e-05, "loss": 0.3609, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19410, "tokens_per_second_per_gpu": 2363.51 }, { "epoch": 1.939285000998602, "grad_norm": 3.9375, "learning_rate": 1.312840660714353e-05, "loss": 0.4079, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19420, "tokens_per_second_per_gpu": 2529.1 }, { "epoch": 1.9402836029558617, "grad_norm": 2.953125, "learning_rate": 1.3118611146781144e-05, "loss": 0.3959, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19430, "tokens_per_second_per_gpu": 2577.74 }, { "epoch": 1.9412822049131218, "grad_norm": 4.8125, "learning_rate": 1.3108815305590358e-05, "loss": 0.4253, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19440, "tokens_per_second_per_gpu": 2435.5 }, { "epoch": 1.9422808068703814, "grad_norm": 3.3125, "learning_rate": 1.3099019089601678e-05, "loss": 0.4079, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19450, "tokens_per_second_per_gpu": 2305.29 }, { "epoch": 1.9432794088276413, "grad_norm": 3.625, "learning_rate": 1.3089222504845828e-05, "loss": 0.3676, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19460, "tokens_per_second_per_gpu": 2448.48 }, { "epoch": 1.9442780107849011, "grad_norm": 3.8125, "learning_rate": 1.3079425557353763e-05, "loss": 0.3345, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19470, "tokens_per_second_per_gpu": 2555.9 }, { "epoch": 1.945276612742161, "grad_norm": 3.265625, "learning_rate": 1.3069628253156672e-05, "loss": 0.3867, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19480, "tokens_per_second_per_gpu": 2201.25 }, { "epoch": 1.9462752146994209, "grad_norm": 2.328125, "learning_rate": 1.3059830598285953e-05, "loss": 0.3546, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19490, "tokens_per_second_per_gpu": 2334.56 }, { "epoch": 1.9472738166566805, "grad_norm": 2.65625, "learning_rate": 1.3050032598773221e-05, "loss": 0.4125, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19500, "tokens_per_second_per_gpu": 2480.26 }, { "epoch": 1.9482724186139406, "grad_norm": 3.390625, "learning_rate": 1.3040234260650302e-05, "loss": 0.4182, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19510, "tokens_per_second_per_gpu": 2456.34 }, { "epoch": 1.9492710205712003, "grad_norm": 3.359375, "learning_rate": 1.3030435589949241e-05, "loss": 0.3713, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19520, "tokens_per_second_per_gpu": 2479.36 }, { "epoch": 1.9502696225284601, "grad_norm": 2.75, "learning_rate": 1.302063659270228e-05, "loss": 0.3242, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19530, "tokens_per_second_per_gpu": 2496.5 }, { "epoch": 1.95126822448572, "grad_norm": 2.984375, "learning_rate": 1.3010837274941848e-05, "loss": 0.3236, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19540, "tokens_per_second_per_gpu": 2445.66 }, { "epoch": 1.9522668264429797, "grad_norm": 3.59375, "learning_rate": 1.30010376427006e-05, "loss": 0.3843, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19550, "tokens_per_second_per_gpu": 2652.56 }, { "epoch": 1.9532654284002398, "grad_norm": 3.265625, "learning_rate": 1.2991237702011364e-05, "loss": 0.369, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19560, "tokens_per_second_per_gpu": 2453.71 }, { "epoch": 1.9542640303574994, "grad_norm": 3.6875, "learning_rate": 1.2981437458907162e-05, "loss": 0.3912, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19570, "tokens_per_second_per_gpu": 2221.45 }, { "epoch": 1.9552626323147595, "grad_norm": 4.375, "learning_rate": 1.297163691942121e-05, "loss": 0.3835, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19580, "tokens_per_second_per_gpu": 2550.85 }, { "epoch": 1.9562612342720191, "grad_norm": 3.09375, "learning_rate": 1.2961836089586896e-05, "loss": 0.3808, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19590, "tokens_per_second_per_gpu": 2335.49 }, { "epoch": 1.957259836229279, "grad_norm": 4.625, "learning_rate": 1.2952034975437787e-05, "loss": 0.3974, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19600, "tokens_per_second_per_gpu": 2406.32 }, { "epoch": 1.9582584381865389, "grad_norm": 3.53125, "learning_rate": 1.2942233583007641e-05, "loss": 0.438, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19610, "tokens_per_second_per_gpu": 2369.72 }, { "epoch": 1.9592570401437985, "grad_norm": 3.40625, "learning_rate": 1.2932431918330361e-05, "loss": 0.3863, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19620, "tokens_per_second_per_gpu": 2398.44 }, { "epoch": 1.9602556421010586, "grad_norm": 5.15625, "learning_rate": 1.292262998744004e-05, "loss": 0.3962, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19630, "tokens_per_second_per_gpu": 2325.16 }, { "epoch": 1.9612542440583183, "grad_norm": 2.75, "learning_rate": 1.291282779637093e-05, "loss": 0.3946, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19640, "tokens_per_second_per_gpu": 2312.46 }, { "epoch": 1.9622528460155781, "grad_norm": 3.421875, "learning_rate": 1.2903025351157439e-05, "loss": 0.4268, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19650, "tokens_per_second_per_gpu": 2424.32 }, { "epoch": 1.963251447972838, "grad_norm": 3.46875, "learning_rate": 1.2893222657834122e-05, "loss": 0.3717, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19660, "tokens_per_second_per_gpu": 2505.4 }, { "epoch": 1.9642500499300979, "grad_norm": 4.4375, "learning_rate": 1.288341972243571e-05, "loss": 0.3648, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19670, "tokens_per_second_per_gpu": 2468.75 }, { "epoch": 1.9652486518873578, "grad_norm": 3.296875, "learning_rate": 1.2873616550997062e-05, "loss": 0.3591, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19680, "tokens_per_second_per_gpu": 2415.91 }, { "epoch": 1.9662472538446174, "grad_norm": 2.96875, "learning_rate": 1.2863813149553197e-05, "loss": 0.3322, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19690, "tokens_per_second_per_gpu": 2432.5 }, { "epoch": 1.9672458558018775, "grad_norm": 4.09375, "learning_rate": 1.2854009524139265e-05, "loss": 0.4131, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19700, "tokens_per_second_per_gpu": 2378.15 }, { "epoch": 1.9682444577591371, "grad_norm": 3.5, "learning_rate": 1.2844205680790561e-05, "loss": 0.3607, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19710, "tokens_per_second_per_gpu": 2339.84 }, { "epoch": 1.969243059716397, "grad_norm": 3.015625, "learning_rate": 1.2834401625542508e-05, "loss": 0.4145, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19720, "tokens_per_second_per_gpu": 2402.8 }, { "epoch": 1.9702416616736569, "grad_norm": 2.75, "learning_rate": 1.2824597364430665e-05, "loss": 0.3871, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19730, "tokens_per_second_per_gpu": 2680.81 }, { "epoch": 1.9712402636309168, "grad_norm": 3.78125, "learning_rate": 1.2814792903490713e-05, "loss": 0.339, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19740, "tokens_per_second_per_gpu": 2342.42 }, { "epoch": 1.9722388655881766, "grad_norm": 3.453125, "learning_rate": 1.2804988248758463e-05, "loss": 0.2848, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19750, "tokens_per_second_per_gpu": 2346.68 }, { "epoch": 1.9732374675454363, "grad_norm": 3.921875, "learning_rate": 1.2795183406269833e-05, "loss": 0.3673, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19760, "tokens_per_second_per_gpu": 2420.6 }, { "epoch": 1.9742360695026964, "grad_norm": 3.015625, "learning_rate": 1.2785378382060875e-05, "loss": 0.32, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19770, "tokens_per_second_per_gpu": 2447.83 }, { "epoch": 1.975234671459956, "grad_norm": 3.53125, "learning_rate": 1.2775573182167735e-05, "loss": 0.4006, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19780, "tokens_per_second_per_gpu": 2475.39 }, { "epoch": 1.9762332734172159, "grad_norm": 4.71875, "learning_rate": 1.2765767812626674e-05, "loss": 0.3822, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19790, "tokens_per_second_per_gpu": 2312.09 }, { "epoch": 1.9772318753744758, "grad_norm": 3.640625, "learning_rate": 1.2755962279474063e-05, "loss": 0.3726, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19800, "tokens_per_second_per_gpu": 2384.89 }, { "epoch": 1.9782304773317354, "grad_norm": 3.453125, "learning_rate": 1.2746156588746364e-05, "loss": 0.3626, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19810, "tokens_per_second_per_gpu": 2343.65 }, { "epoch": 1.9792290792889955, "grad_norm": 3.890625, "learning_rate": 1.2736350746480139e-05, "loss": 0.3677, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19820, "tokens_per_second_per_gpu": 2454.81 }, { "epoch": 1.9802276812462551, "grad_norm": 2.59375, "learning_rate": 1.272654475871205e-05, "loss": 0.3489, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19830, "tokens_per_second_per_gpu": 2239.12 }, { "epoch": 1.9812262832035152, "grad_norm": 3.53125, "learning_rate": 1.2716738631478841e-05, "loss": 0.39, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19840, "tokens_per_second_per_gpu": 2462.45 }, { "epoch": 1.9822248851607749, "grad_norm": 3.234375, "learning_rate": 1.2706932370817348e-05, "loss": 0.3495, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19850, "tokens_per_second_per_gpu": 2306.96 }, { "epoch": 1.9832234871180348, "grad_norm": 4.4375, "learning_rate": 1.2697125982764477e-05, "loss": 0.4035, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19860, "tokens_per_second_per_gpu": 2386.43 }, { "epoch": 1.9842220890752946, "grad_norm": 2.34375, "learning_rate": 1.2687319473357229e-05, "loss": 0.3414, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19870, "tokens_per_second_per_gpu": 2185.57 }, { "epoch": 1.9852206910325543, "grad_norm": 3.46875, "learning_rate": 1.2677512848632663e-05, "loss": 0.352, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19880, "tokens_per_second_per_gpu": 2255.97 }, { "epoch": 1.9862192929898144, "grad_norm": 3.46875, "learning_rate": 1.2667706114627926e-05, "loss": 0.3283, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19890, "tokens_per_second_per_gpu": 2423.27 }, { "epoch": 1.987217894947074, "grad_norm": 4.0625, "learning_rate": 1.2657899277380222e-05, "loss": 0.3461, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19900, "tokens_per_second_per_gpu": 2496.33 }, { "epoch": 1.9882164969043339, "grad_norm": 3.359375, "learning_rate": 1.2648092342926821e-05, "loss": 0.3672, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19910, "tokens_per_second_per_gpu": 2328.33 }, { "epoch": 1.9892150988615938, "grad_norm": 3.75, "learning_rate": 1.2638285317305043e-05, "loss": 0.3772, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19920, "tokens_per_second_per_gpu": 2412.85 }, { "epoch": 1.9902137008188536, "grad_norm": 3.171875, "learning_rate": 1.2628478206552285e-05, "loss": 0.4296, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19930, "tokens_per_second_per_gpu": 2376.56 }, { "epoch": 1.9912123027761135, "grad_norm": 3.28125, "learning_rate": 1.2618671016705979e-05, "loss": 0.3966, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19940, "tokens_per_second_per_gpu": 2473.32 }, { "epoch": 1.9922109047333731, "grad_norm": 3.59375, "learning_rate": 1.2608863753803615e-05, "loss": 0.4175, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19950, "tokens_per_second_per_gpu": 2245.25 }, { "epoch": 1.9932095066906332, "grad_norm": 2.515625, "learning_rate": 1.2599056423882718e-05, "loss": 0.3791, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19960, "tokens_per_second_per_gpu": 2420.53 }, { "epoch": 1.9942081086478929, "grad_norm": 3.28125, "learning_rate": 1.2589249032980868e-05, "loss": 0.3791, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19970, "tokens_per_second_per_gpu": 2388.55 }, { "epoch": 1.9952067106051528, "grad_norm": 3.171875, "learning_rate": 1.2579441587135674e-05, "loss": 0.3347, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19980, "tokens_per_second_per_gpu": 2327.82 }, { "epoch": 1.9962053125624126, "grad_norm": 3.828125, "learning_rate": 1.2569634092384771e-05, "loss": 0.3912, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 19990, "tokens_per_second_per_gpu": 2411.73 }, { "epoch": 1.9972039145196725, "grad_norm": 3.625, "learning_rate": 1.2559826554765844e-05, "loss": 0.3875, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20000, "tokens_per_second_per_gpu": 2305.24 }, { "epoch": 1.9982025164769324, "grad_norm": 3.703125, "learning_rate": 1.2550018980316591e-05, "loss": 0.3579, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20010, "tokens_per_second_per_gpu": 2473.31 }, { "epoch": 1.999201118434192, "grad_norm": 3.40625, "learning_rate": 1.2540211375074731e-05, "loss": 0.3578, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20020, "tokens_per_second_per_gpu": 2319.59 }, { "epoch": 2.000199720391452, "grad_norm": 2.5, "learning_rate": 1.2530403745078012e-05, "loss": 0.3393, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20030, "tokens_per_second_per_gpu": 2358.26 }, { "epoch": 2.0011983223487118, "grad_norm": 4.09375, "learning_rate": 1.2520596096364188e-05, "loss": 0.2728, "memory/device_reserved (GiB)": 41.27, "memory/max_active (GiB)": 40.43, "memory/max_allocated (GiB)": 40.43, "step": 20040, "tokens_per_second_per_gpu": 2420.2 } ], "logging_steps": 10, "max_steps": 40080, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 20040, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7895156743562854e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }