{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.727387276450121, "eval_steps": 500, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011454917739371984, "grad_norm": 53.10772705078125, "learning_rate": 1.1228230980751604e-06, "loss": 5.3499, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 50 }, { "epoch": 0.022909835478743968, "grad_norm": 33.95736312866211, "learning_rate": 2.268560953253896e-06, "loss": 5.0386, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 100 }, { "epoch": 0.034364753218115954, "grad_norm": 16.349882125854492, "learning_rate": 3.414298808432631e-06, "loss": 3.9819, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 150 }, { "epoch": 0.045819670957487936, "grad_norm": 15.363502502441406, "learning_rate": 4.5600366636113664e-06, "loss": 3.164, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 200 }, { "epoch": 0.05727458869685992, "grad_norm": 17.262718200683594, "learning_rate": 5.705774518790101e-06, "loss": 2.8121, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 250 }, { "epoch": 0.06872950643623191, "grad_norm": 14.994147300720215, "learning_rate": 6.8515123739688366e-06, "loss": 2.4217, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 300 }, { "epoch": 0.08018442417560388, "grad_norm": 11.715180397033691, "learning_rate": 7.997250229147571e-06, "loss": 2.1894, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 350 }, { "epoch": 0.09163934191497587, "grad_norm": 10.08484172821045, "learning_rate": 9.142988084326307e-06, "loss": 2.1338, "memory/device_mem_reserved(gib)": 49.37, "memory/max_mem_active(gib)": 44.85, "memory/max_mem_allocated(gib)": 44.85, "step": 400 }, { "epoch": 0.10309425965434786, "grad_norm": 9.557114601135254, "learning_rate": 1.0288725939505042e-05, "loss": 2.0711, "memory/device_mem_reserved(gib)": 51.36, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 450 }, { "epoch": 0.11454917739371984, "grad_norm": 9.078670501708984, "learning_rate": 1.1434463794683776e-05, "loss": 2.0381, "memory/device_mem_reserved(gib)": 51.36, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 500 }, { "epoch": 0.12600409513309183, "grad_norm": 9.677817344665527, "learning_rate": 1.2580201649862511e-05, "loss": 2.0244, "memory/device_mem_reserved(gib)": 51.36, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 550 }, { "epoch": 0.13745901287246381, "grad_norm": 10.270977973937988, "learning_rate": 1.3725939505041247e-05, "loss": 1.9982, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 600 }, { "epoch": 0.1489139306118358, "grad_norm": 8.053842544555664, "learning_rate": 1.4871677360219982e-05, "loss": 2.0071, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 650 }, { "epoch": 0.16036884835120777, "grad_norm": 8.858375549316406, "learning_rate": 1.6017415215398718e-05, "loss": 1.9546, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 700 }, { "epoch": 0.17182376609057975, "grad_norm": 9.538141250610352, "learning_rate": 1.7163153070577455e-05, "loss": 1.9464, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 750 }, { "epoch": 0.18327868382995174, "grad_norm": 7.541695594787598, "learning_rate": 1.830889092575619e-05, "loss": 1.9085, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 800 }, { "epoch": 0.19473360156932373, "grad_norm": 7.665754318237305, "learning_rate": 1.9454628780934923e-05, "loss": 1.9153, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 850 }, { "epoch": 0.20618851930869572, "grad_norm": 9.04691219329834, "learning_rate": 2.0600366636113656e-05, "loss": 1.8734, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 900 }, { "epoch": 0.2176434370480677, "grad_norm": 7.098514080047607, "learning_rate": 2.1746104491292394e-05, "loss": 1.8809, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 950 }, { "epoch": 0.22909835478743967, "grad_norm": 7.5708513259887695, "learning_rate": 2.2891842346471127e-05, "loss": 1.8459, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1000 }, { "epoch": 0.24055327252681166, "grad_norm": 8.422965049743652, "learning_rate": 2.4037580201649865e-05, "loss": 1.8414, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1050 }, { "epoch": 0.25200819026618365, "grad_norm": 7.765232563018799, "learning_rate": 2.51833180568286e-05, "loss": 1.857, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1100 }, { "epoch": 0.2634631080055556, "grad_norm": 7.53985595703125, "learning_rate": 2.6329055912007332e-05, "loss": 1.8113, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1150 }, { "epoch": 0.27491802574492763, "grad_norm": 7.5806450843811035, "learning_rate": 2.747479376718607e-05, "loss": 1.793, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1200 }, { "epoch": 0.2863729434842996, "grad_norm": 6.706181526184082, "learning_rate": 2.8620531622364803e-05, "loss": 1.8109, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1250 }, { "epoch": 0.2978278612236716, "grad_norm": 7.132224082946777, "learning_rate": 2.976626947754354e-05, "loss": 1.7923, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1300 }, { "epoch": 0.30928277896304357, "grad_norm": 7.725433826446533, "learning_rate": 3.091200733272228e-05, "loss": 1.7504, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1350 }, { "epoch": 0.32073769670241553, "grad_norm": 7.6306843757629395, "learning_rate": 3.205774518790101e-05, "loss": 1.7802, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1400 }, { "epoch": 0.33219261444178755, "grad_norm": 7.927916049957275, "learning_rate": 3.3203483043079745e-05, "loss": 1.7454, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1450 }, { "epoch": 0.3436475321811595, "grad_norm": 7.468013286590576, "learning_rate": 3.434922089825848e-05, "loss": 1.716, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1500 }, { "epoch": 0.3551024499205315, "grad_norm": 6.887967586517334, "learning_rate": 3.549495875343721e-05, "loss": 1.7054, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1550 }, { "epoch": 0.3665573676599035, "grad_norm": 7.042320251464844, "learning_rate": 3.6640696608615946e-05, "loss": 1.716, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1600 }, { "epoch": 0.37801228539927545, "grad_norm": 7.46671199798584, "learning_rate": 3.778643446379469e-05, "loss": 1.7074, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1650 }, { "epoch": 0.38946720313864747, "grad_norm": 4.348405838012695, "learning_rate": 3.893217231897342e-05, "loss": 1.6607, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1700 }, { "epoch": 0.40092212087801943, "grad_norm": 7.3193511962890625, "learning_rate": 4.0077910174152155e-05, "loss": 1.6516, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1750 }, { "epoch": 0.41237703861739144, "grad_norm": 7.363260746002197, "learning_rate": 4.122364802933089e-05, "loss": 1.6137, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1800 }, { "epoch": 0.4238319563567634, "grad_norm": 7.189822673797607, "learning_rate": 4.236938588450963e-05, "loss": 1.5882, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1850 }, { "epoch": 0.4352868740961354, "grad_norm": 7.271198272705078, "learning_rate": 4.351512373968836e-05, "loss": 1.5759, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1900 }, { "epoch": 0.4467417918355074, "grad_norm": 10.216059684753418, "learning_rate": 4.4660861594867096e-05, "loss": 1.5663, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 1950 }, { "epoch": 0.45819670957487935, "grad_norm": 6.804873943328857, "learning_rate": 4.580659945004584e-05, "loss": 1.5447, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2000 }, { "epoch": 0.46965162731425136, "grad_norm": 7.637989044189453, "learning_rate": 4.695233730522457e-05, "loss": 1.553, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2050 }, { "epoch": 0.4811065450536233, "grad_norm": 6.641468048095703, "learning_rate": 4.80980751604033e-05, "loss": 1.5486, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2100 }, { "epoch": 0.49256146279299534, "grad_norm": 7.134258270263672, "learning_rate": 4.924381301558204e-05, "loss": 1.5126, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2150 }, { "epoch": 0.5040163805323673, "grad_norm": 6.905734062194824, "learning_rate": 5.038955087076077e-05, "loss": 1.4918, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2200 }, { "epoch": 0.5154712982717393, "grad_norm": 7.143308162689209, "learning_rate": 5.153528872593951e-05, "loss": 1.4778, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2250 }, { "epoch": 0.5269262160111112, "grad_norm": 6.968287467956543, "learning_rate": 5.268102658111824e-05, "loss": 1.431, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2300 }, { "epoch": 0.5383811337504832, "grad_norm": 7.385350704193115, "learning_rate": 5.3826764436296974e-05, "loss": 1.4638, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2350 }, { "epoch": 0.5498360514898553, "grad_norm": 6.7367095947265625, "learning_rate": 5.4972502291475714e-05, "loss": 1.4236, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2400 }, { "epoch": 0.5612909692292273, "grad_norm": 7.013253211975098, "learning_rate": 5.611824014665444e-05, "loss": 1.3933, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2450 }, { "epoch": 0.5727458869685992, "grad_norm": 7.49541711807251, "learning_rate": 5.726397800183319e-05, "loss": 1.3847, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2500 }, { "epoch": 0.5842008047079712, "grad_norm": 7.078319549560547, "learning_rate": 5.8409715857011915e-05, "loss": 1.3825, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2550 }, { "epoch": 0.5956557224473432, "grad_norm": 7.429485321044922, "learning_rate": 5.9555453712190656e-05, "loss": 1.3629, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2600 }, { "epoch": 0.6071106401867151, "grad_norm": 7.05700159072876, "learning_rate": 6.070119156736939e-05, "loss": 1.3372, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2650 }, { "epoch": 0.6185655579260871, "grad_norm": 7.29513692855835, "learning_rate": 6.184692942254812e-05, "loss": 1.3185, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2700 }, { "epoch": 0.6300204756654592, "grad_norm": 6.8477911949157715, "learning_rate": 6.299266727772686e-05, "loss": 1.3066, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2750 }, { "epoch": 0.6414753934048311, "grad_norm": 7.389026641845703, "learning_rate": 6.41384051329056e-05, "loss": 1.2829, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2800 }, { "epoch": 0.6529303111442031, "grad_norm": 6.852631568908691, "learning_rate": 6.528414298808432e-05, "loss": 1.274, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2850 }, { "epoch": 0.6643852288835751, "grad_norm": 7.158923625946045, "learning_rate": 6.642988084326306e-05, "loss": 1.2486, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2900 }, { "epoch": 0.6758401466229471, "grad_norm": 7.069329261779785, "learning_rate": 6.75756186984418e-05, "loss": 1.2517, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 2950 }, { "epoch": 0.687295064362319, "grad_norm": 6.942631721496582, "learning_rate": 6.872135655362053e-05, "loss": 1.1829, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 3000 }, { "epoch": 0.698749982101691, "grad_norm": 7.831090450286865, "learning_rate": 6.986709440879927e-05, "loss": 1.2037, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 3050 }, { "epoch": 0.710204899841063, "grad_norm": 6.641531467437744, "learning_rate": 7.101283226397801e-05, "loss": 1.1565, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 3100 }, { "epoch": 0.721659817580435, "grad_norm": 7.846933841705322, "learning_rate": 7.215857011915674e-05, "loss": 1.1647, "memory/device_mem_reserved(gib)": 51.42, "memory/max_mem_active(gib)": 46.59, "memory/max_mem_allocated(gib)": 46.59, "step": 3150 }, { "epoch": 0.733114735319807, "grad_norm": 6.905858039855957, "learning_rate": 7.330430797433548e-05, "loss": 1.1853, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3200 }, { "epoch": 0.744569653059179, "grad_norm": 7.997142314910889, "learning_rate": 7.445004582951421e-05, "loss": 1.1541, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3250 }, { "epoch": 0.7560245707985509, "grad_norm": 6.95665979385376, "learning_rate": 7.559578368469294e-05, "loss": 1.1223, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3300 }, { "epoch": 0.7674794885379229, "grad_norm": 7.185131549835205, "learning_rate": 7.674152153987169e-05, "loss": 1.1113, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3350 }, { "epoch": 0.7789344062772949, "grad_norm": 6.778895854949951, "learning_rate": 7.788725939505041e-05, "loss": 1.0765, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3400 }, { "epoch": 0.790389324016667, "grad_norm": 7.30415153503418, "learning_rate": 7.903299725022914e-05, "loss": 1.0601, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3450 }, { "epoch": 0.8018442417560389, "grad_norm": 6.911710739135742, "learning_rate": 8.017873510540789e-05, "loss": 1.0406, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3500 }, { "epoch": 0.8132991594954109, "grad_norm": 7.257194995880127, "learning_rate": 8.132447296058661e-05, "loss": 1.0284, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3550 }, { "epoch": 0.8247540772347829, "grad_norm": 8.09947395324707, "learning_rate": 8.247021081576536e-05, "loss": 1.0178, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3600 }, { "epoch": 0.8362089949741548, "grad_norm": 7.630951404571533, "learning_rate": 8.361594867094409e-05, "loss": 1.0031, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3650 }, { "epoch": 0.8476639127135268, "grad_norm": 7.508652210235596, "learning_rate": 8.476168652612283e-05, "loss": 0.9628, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3700 }, { "epoch": 0.8591188304528988, "grad_norm": 8.247767448425293, "learning_rate": 8.590742438130156e-05, "loss": 0.9669, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3750 }, { "epoch": 0.8705737481922708, "grad_norm": 7.914950370788574, "learning_rate": 8.705316223648031e-05, "loss": 0.9656, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3800 }, { "epoch": 0.8820286659316428, "grad_norm": 7.725244045257568, "learning_rate": 8.819890009165903e-05, "loss": 0.9541, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3850 }, { "epoch": 0.8934835836710148, "grad_norm": 6.968287467956543, "learning_rate": 8.934463794683778e-05, "loss": 0.9421, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3900 }, { "epoch": 0.9049385014103868, "grad_norm": 6.712941646575928, "learning_rate": 9.049037580201651e-05, "loss": 0.921, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 3950 }, { "epoch": 0.9163934191497587, "grad_norm": 6.738905429840088, "learning_rate": 9.163611365719523e-05, "loss": 0.9133, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4000 }, { "epoch": 0.9278483368891307, "grad_norm": 8.376337051391602, "learning_rate": 9.278185151237398e-05, "loss": 0.9016, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4050 }, { "epoch": 0.9393032546285027, "grad_norm": 7.274137020111084, "learning_rate": 9.392758936755271e-05, "loss": 0.8829, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4100 }, { "epoch": 0.9507581723678746, "grad_norm": 7.919043064117432, "learning_rate": 9.507332722273144e-05, "loss": 0.8555, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4150 }, { "epoch": 0.9622130901072466, "grad_norm": 6.632596015930176, "learning_rate": 9.621906507791018e-05, "loss": 0.8945, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4200 }, { "epoch": 0.9736680078466187, "grad_norm": 7.122948169708252, "learning_rate": 9.736480293308891e-05, "loss": 0.8447, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4250 }, { "epoch": 0.9851229255859907, "grad_norm": 6.747700214385986, "learning_rate": 9.851054078826765e-05, "loss": 0.8283, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4300 }, { "epoch": 0.9965778433253626, "grad_norm": 6.4440765380859375, "learning_rate": 9.965627864344639e-05, "loss": 0.8052, "memory/device_mem_reserved(gib)": 53.21, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4350 }, { "epoch": 1.0080184424175604, "grad_norm": 6.0668182373046875, "learning_rate": 9.99998041506907e-05, "loss": 0.7038, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4400 }, { "epoch": 1.0194733601569324, "grad_norm": 8.027034759521484, "learning_rate": 9.999884489246108e-05, "loss": 0.6596, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4450 }, { "epoch": 1.0309282778963043, "grad_norm": 6.251341819763184, "learning_rate": 9.999708626830618e-05, "loss": 0.6702, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4500 }, { "epoch": 1.0423831956356764, "grad_norm": 6.4562506675720215, "learning_rate": 9.999452830634232e-05, "loss": 0.6421, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4550 }, { "epoch": 1.0538381133750483, "grad_norm": 6.475341796875, "learning_rate": 9.999117104746543e-05, "loss": 0.6355, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4600 }, { "epoch": 1.0652930311144204, "grad_norm": 6.397785663604736, "learning_rate": 9.998701454535029e-05, "loss": 0.638, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4650 }, { "epoch": 1.0767479488537923, "grad_norm": 6.843462944030762, "learning_rate": 9.998205886644977e-05, "loss": 0.6332, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4700 }, { "epoch": 1.0882028665931642, "grad_norm": 6.432698726654053, "learning_rate": 9.997630408999371e-05, "loss": 0.6187, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4750 }, { "epoch": 1.0996577843325364, "grad_norm": 7.654291152954102, "learning_rate": 9.996975030798767e-05, "loss": 0.6118, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4800 }, { "epoch": 1.1111127020719083, "grad_norm": 5.9475812911987305, "learning_rate": 9.996239762521151e-05, "loss": 0.6068, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4850 }, { "epoch": 1.1225676198112802, "grad_norm": 7.744262218475342, "learning_rate": 9.995424615921757e-05, "loss": 0.6021, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4900 }, { "epoch": 1.1340225375506523, "grad_norm": 6.4447197914123535, "learning_rate": 9.9945296040329e-05, "loss": 0.6119, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 4950 }, { "epoch": 1.1454774552900242, "grad_norm": 6.5645432472229, "learning_rate": 9.993554741163749e-05, "loss": 0.5836, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5000 }, { "epoch": 1.156932373029396, "grad_norm": 6.169116020202637, "learning_rate": 9.992500042900104e-05, "loss": 0.585, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5050 }, { "epoch": 1.1683872907687682, "grad_norm": 8.242532730102539, "learning_rate": 9.991365526104154e-05, "loss": 0.5657, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5100 }, { "epoch": 1.1798422085081401, "grad_norm": 7.146617412567139, "learning_rate": 9.990151208914202e-05, "loss": 0.5808, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5150 }, { "epoch": 1.191297126247512, "grad_norm": 6.222508430480957, "learning_rate": 9.988857110744367e-05, "loss": 0.554, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5200 }, { "epoch": 1.2027520439868842, "grad_norm": 6.2183146476745605, "learning_rate": 9.987483252284291e-05, "loss": 0.549, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5250 }, { "epoch": 1.214206961726256, "grad_norm": 6.201925754547119, "learning_rate": 9.986029655498792e-05, "loss": 0.5595, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5300 }, { "epoch": 1.225661879465628, "grad_norm": 6.70211124420166, "learning_rate": 9.984496343627523e-05, "loss": 0.557, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5350 }, { "epoch": 1.2371167972050001, "grad_norm": 6.915555477142334, "learning_rate": 9.982883341184593e-05, "loss": 0.5267, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5400 }, { "epoch": 1.248571714944372, "grad_norm": 6.054184913635254, "learning_rate": 9.981190673958185e-05, "loss": 0.5359, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5450 }, { "epoch": 1.260026632683744, "grad_norm": 6.70853328704834, "learning_rate": 9.979418369010131e-05, "loss": 0.5326, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5500 }, { "epoch": 1.271481550423116, "grad_norm": 6.052192211151123, "learning_rate": 9.977566454675492e-05, "loss": 0.5156, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5550 }, { "epoch": 1.282936468162488, "grad_norm": 6.540433406829834, "learning_rate": 9.975634960562094e-05, "loss": 0.5274, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5600 }, { "epoch": 1.2943913859018599, "grad_norm": 5.683777332305908, "learning_rate": 9.973623917550065e-05, "loss": 0.5169, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5650 }, { "epoch": 1.305846303641232, "grad_norm": 5.470891952514648, "learning_rate": 9.97153335779133e-05, "loss": 0.5018, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5700 }, { "epoch": 1.317301221380604, "grad_norm": 4.297957897186279, "learning_rate": 9.969363314709107e-05, "loss": 0.4915, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5750 }, { "epoch": 1.3287561391199758, "grad_norm": 6.072817325592041, "learning_rate": 9.967113822997367e-05, "loss": 0.4886, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5800 }, { "epoch": 1.340211056859348, "grad_norm": 5.685266017913818, "learning_rate": 9.964784918620282e-05, "loss": 0.4925, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5850 }, { "epoch": 1.3516659745987198, "grad_norm": 7.324371337890625, "learning_rate": 9.962376638811648e-05, "loss": 0.4557, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5900 }, { "epoch": 1.363120892338092, "grad_norm": 5.497219085693359, "learning_rate": 9.959889022074291e-05, "loss": 0.4731, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 5950 }, { "epoch": 1.3745758100774639, "grad_norm": 5.637268543243408, "learning_rate": 9.95732210817945e-05, "loss": 0.4653, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6000 }, { "epoch": 1.3860307278168358, "grad_norm": 5.0990705490112305, "learning_rate": 9.954675938166145e-05, "loss": 0.4563, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6050 }, { "epoch": 1.397485645556208, "grad_norm": 5.403497695922852, "learning_rate": 9.951950554340515e-05, "loss": 0.4427, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6100 }, { "epoch": 1.4089405632955798, "grad_norm": 5.238762378692627, "learning_rate": 9.949146000275145e-05, "loss": 0.4517, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6150 }, { "epoch": 1.420395481034952, "grad_norm": 3.6025900840759277, "learning_rate": 9.946262320808371e-05, "loss": 0.4287, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6200 }, { "epoch": 1.4318503987743239, "grad_norm": 4.929942607879639, "learning_rate": 9.94329956204356e-05, "loss": 0.4268, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6250 }, { "epoch": 1.4433053165136958, "grad_norm": 6.123436450958252, "learning_rate": 9.940257771348375e-05, "loss": 0.4254, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6300 }, { "epoch": 1.4547602342530679, "grad_norm": 6.038297653198242, "learning_rate": 9.937136997354015e-05, "loss": 0.4089, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6350 }, { "epoch": 1.4662151519924398, "grad_norm": 5.310572147369385, "learning_rate": 9.93393728995444e-05, "loss": 0.4142, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6400 }, { "epoch": 1.4776700697318117, "grad_norm": 5.765950679779053, "learning_rate": 9.930658700305576e-05, "loss": 0.4095, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6450 }, { "epoch": 1.4891249874711838, "grad_norm": 5.236095905303955, "learning_rate": 9.927301280824489e-05, "loss": 0.4068, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6500 }, { "epoch": 1.5005799052105557, "grad_norm": 5.353938102722168, "learning_rate": 9.923865085188552e-05, "loss": 0.4121, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6550 }, { "epoch": 1.5120348229499276, "grad_norm": 6.9634881019592285, "learning_rate": 9.920350168334591e-05, "loss": 0.393, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6600 }, { "epoch": 1.5234897406892998, "grad_norm": 4.650847911834717, "learning_rate": 9.916756586457999e-05, "loss": 0.385, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6650 }, { "epoch": 1.5349446584286717, "grad_norm": 5.3702311515808105, "learning_rate": 9.91308439701184e-05, "loss": 0.3965, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6700 }, { "epoch": 1.5463995761680436, "grad_norm": 5.833876132965088, "learning_rate": 9.909333658705933e-05, "loss": 0.3859, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6750 }, { "epoch": 1.5578544939074157, "grad_norm": 4.853622913360596, "learning_rate": 9.905504431505912e-05, "loss": 0.3788, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6800 }, { "epoch": 1.5693094116467876, "grad_norm": 4.673356056213379, "learning_rate": 9.901596776632266e-05, "loss": 0.3726, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6850 }, { "epoch": 1.5807643293861595, "grad_norm": 4.708242416381836, "learning_rate": 9.897610756559361e-05, "loss": 0.3624, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6900 }, { "epoch": 1.5922192471255316, "grad_norm": 5.4483962059021, "learning_rate": 9.893546435014442e-05, "loss": 0.371, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 6950 }, { "epoch": 1.6036741648649036, "grad_norm": 5.3623223304748535, "learning_rate": 9.889403876976614e-05, "loss": 0.3574, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7000 }, { "epoch": 1.6151290826042755, "grad_norm": 4.880136013031006, "learning_rate": 9.8851831486758e-05, "loss": 0.3654, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7050 }, { "epoch": 1.6265840003436476, "grad_norm": 4.732957363128662, "learning_rate": 9.880884317591687e-05, "loss": 0.3563, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7100 }, { "epoch": 1.6380389180830195, "grad_norm": 4.353087902069092, "learning_rate": 9.876507452452646e-05, "loss": 0.3523, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7150 }, { "epoch": 1.6494938358223914, "grad_norm": 5.005238056182861, "learning_rate": 9.872052623234632e-05, "loss": 0.3402, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7200 }, { "epoch": 1.6609487535617635, "grad_norm": 4.400302410125732, "learning_rate": 9.867519901160059e-05, "loss": 0.3522, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7250 }, { "epoch": 1.6724036713011354, "grad_norm": 5.095331192016602, "learning_rate": 9.862909358696674e-05, "loss": 0.3431, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7300 }, { "epoch": 1.6838585890405073, "grad_norm": 4.416248798370361, "learning_rate": 9.858221069556395e-05, "loss": 0.3373, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7350 }, { "epoch": 1.6953135067798795, "grad_norm": 4.021190166473389, "learning_rate": 9.85345510869412e-05, "loss": 0.3298, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7400 }, { "epoch": 1.7067684245192514, "grad_norm": 5.18602180480957, "learning_rate": 9.848611552306548e-05, "loss": 0.3405, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7450 }, { "epoch": 1.7182233422586233, "grad_norm": 4.7404608726501465, "learning_rate": 9.843690477830945e-05, "loss": 0.3278, "memory/device_mem_reserved(gib)": 53.23, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7500 }, { "epoch": 1.7296782599979954, "grad_norm": 5.107292175292969, "learning_rate": 9.838691963943912e-05, "loss": 0.3351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7550 }, { "epoch": 1.7411331777373675, "grad_norm": 4.792062759399414, "learning_rate": 9.83361609056013e-05, "loss": 0.3212, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7600 }, { "epoch": 1.7525880954767392, "grad_norm": 5.694723606109619, "learning_rate": 9.82846293883108e-05, "loss": 0.3191, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7650 }, { "epoch": 1.7640430132161113, "grad_norm": 4.297928333282471, "learning_rate": 9.823232591143741e-05, "loss": 0.3096, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7700 }, { "epoch": 1.7754979309554835, "grad_norm": 4.557746887207031, "learning_rate": 9.817925131119279e-05, "loss": 0.3055, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7750 }, { "epoch": 1.7869528486948552, "grad_norm": 4.228251934051514, "learning_rate": 9.81254064361171e-05, "loss": 0.3149, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7800 }, { "epoch": 1.7984077664342273, "grad_norm": 4.910319805145264, "learning_rate": 9.807079214706538e-05, "loss": 0.3141, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7850 }, { "epoch": 1.8098626841735994, "grad_norm": 5.196345329284668, "learning_rate": 9.801540931719384e-05, "loss": 0.3035, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7900 }, { "epoch": 1.8213176019129713, "grad_norm": 4.111600875854492, "learning_rate": 9.795925883194588e-05, "loss": 0.3033, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 7950 }, { "epoch": 1.8327725196523432, "grad_norm": 4.21397590637207, "learning_rate": 9.790234158903792e-05, "loss": 0.3068, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8000 }, { "epoch": 1.8442274373917154, "grad_norm": 4.57835578918457, "learning_rate": 9.784465849844511e-05, "loss": 0.3, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8050 }, { "epoch": 1.8556823551310873, "grad_norm": 4.8795294761657715, "learning_rate": 9.778621048238664e-05, "loss": 0.2919, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8100 }, { "epoch": 1.8671372728704592, "grad_norm": 4.112079620361328, "learning_rate": 9.77269984753112e-05, "loss": 0.2866, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8150 }, { "epoch": 1.8785921906098313, "grad_norm": 5.471593856811523, "learning_rate": 9.766702342388184e-05, "loss": 0.2942, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8200 }, { "epoch": 1.8900471083492032, "grad_norm": 5.2102766036987305, "learning_rate": 9.760628628696096e-05, "loss": 0.2926, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8250 }, { "epoch": 1.901502026088575, "grad_norm": 4.992270469665527, "learning_rate": 9.754478803559498e-05, "loss": 0.2874, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8300 }, { "epoch": 1.9129569438279472, "grad_norm": 4.012945175170898, "learning_rate": 9.748252965299872e-05, "loss": 0.2774, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8350 }, { "epoch": 1.9244118615673191, "grad_norm": 4.634591102600098, "learning_rate": 9.741951213453977e-05, "loss": 0.2795, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8400 }, { "epoch": 1.935866779306691, "grad_norm": 4.384332656860352, "learning_rate": 9.735573648772257e-05, "loss": 0.2785, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8450 }, { "epoch": 1.9473216970460632, "grad_norm": 4.638082504272461, "learning_rate": 9.72912037321722e-05, "loss": 0.2803, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8500 }, { "epoch": 1.958776614785435, "grad_norm": 3.405381917953491, "learning_rate": 9.722591489961827e-05, "loss": 0.2729, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8550 }, { "epoch": 1.970231532524807, "grad_norm": 4.394991874694824, "learning_rate": 9.715987103387823e-05, "loss": 0.2751, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8600 }, { "epoch": 1.9816864502641791, "grad_norm": 5.380841255187988, "learning_rate": 9.709307319084077e-05, "loss": 0.2725, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8650 }, { "epoch": 1.993141368003551, "grad_norm": 3.7391974925994873, "learning_rate": 9.702552243844899e-05, "loss": 0.2659, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8700 }, { "epoch": 2.0045819670957488, "grad_norm": 3.6832714080810547, "learning_rate": 9.69572198566832e-05, "loss": 0.2254, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8750 }, { "epoch": 2.016036884835121, "grad_norm": 3.2387888431549072, "learning_rate": 9.68881665375438e-05, "loss": 0.1553, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8800 }, { "epoch": 2.0274918025744926, "grad_norm": 3.022691488265991, "learning_rate": 9.681836358503367e-05, "loss": 0.1662, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8850 }, { "epoch": 2.0389467203138647, "grad_norm": 3.7819292545318604, "learning_rate": 9.674781211514063e-05, "loss": 0.1651, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8900 }, { "epoch": 2.050401638053237, "grad_norm": 4.307174205780029, "learning_rate": 9.667651325581955e-05, "loss": 0.1595, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 8950 }, { "epoch": 2.0618565557926085, "grad_norm": 3.7441294193267822, "learning_rate": 9.660446814697436e-05, "loss": 0.1603, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9000 }, { "epoch": 2.0733114735319806, "grad_norm": 3.3949477672576904, "learning_rate": 9.653167794043976e-05, "loss": 0.1635, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9050 }, { "epoch": 2.0847663912713528, "grad_norm": 3.6564900875091553, "learning_rate": 9.645814379996285e-05, "loss": 0.1595, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9100 }, { "epoch": 2.0962213090107245, "grad_norm": 3.380403995513916, "learning_rate": 9.638386690118452e-05, "loss": 0.1552, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9150 }, { "epoch": 2.1076762267500966, "grad_norm": 3.9699547290802, "learning_rate": 9.630884843162063e-05, "loss": 0.1603, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9200 }, { "epoch": 2.1191311444894687, "grad_norm": 2.764639139175415, "learning_rate": 9.623308959064306e-05, "loss": 0.1587, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9250 }, { "epoch": 2.130586062228841, "grad_norm": 3.9039690494537354, "learning_rate": 9.615659158946053e-05, "loss": 0.1621, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9300 }, { "epoch": 2.1420409799682125, "grad_norm": 3.1429221630096436, "learning_rate": 9.607935565109917e-05, "loss": 0.161, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9350 }, { "epoch": 2.1534958977075846, "grad_norm": 3.3480520248413086, "learning_rate": 9.600138301038311e-05, "loss": 0.1645, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9400 }, { "epoch": 2.1649508154469568, "grad_norm": 3.3411660194396973, "learning_rate": 9.592267491391452e-05, "loss": 0.1637, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9450 }, { "epoch": 2.1764057331863285, "grad_norm": 3.773784637451172, "learning_rate": 9.584323262005393e-05, "loss": 0.1631, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9500 }, { "epoch": 2.1878606509257006, "grad_norm": 2.9222793579101562, "learning_rate": 9.576305739889991e-05, "loss": 0.1598, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9550 }, { "epoch": 2.1993155686650727, "grad_norm": 3.034086227416992, "learning_rate": 9.568215053226888e-05, "loss": 0.1602, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9600 }, { "epoch": 2.2107704864044444, "grad_norm": 4.284358501434326, "learning_rate": 9.560051331367457e-05, "loss": 0.1624, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9650 }, { "epoch": 2.2222254041438165, "grad_norm": 4.235621929168701, "learning_rate": 9.551814704830734e-05, "loss": 0.1593, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9700 }, { "epoch": 2.2336803218831887, "grad_norm": 3.487086057662964, "learning_rate": 9.543505305301334e-05, "loss": 0.155, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9750 }, { "epoch": 2.2451352396225603, "grad_norm": 3.9365508556365967, "learning_rate": 9.535123265627343e-05, "loss": 0.1608, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9800 }, { "epoch": 2.2565901573619325, "grad_norm": 4.065316200256348, "learning_rate": 9.526668719818195e-05, "loss": 0.1623, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9850 }, { "epoch": 2.2680450751013046, "grad_norm": 3.1943957805633545, "learning_rate": 9.518141803042527e-05, "loss": 0.1646, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9900 }, { "epoch": 2.2794999928406763, "grad_norm": 3.362541913986206, "learning_rate": 9.509542651626027e-05, "loss": 0.1591, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 9950 }, { "epoch": 2.2909549105800484, "grad_norm": 3.442073345184326, "learning_rate": 9.500871403049239e-05, "loss": 0.1604, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10000 }, { "epoch": 2.3024098283194205, "grad_norm": 3.4276912212371826, "learning_rate": 9.492128195945383e-05, "loss": 0.1571, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10050 }, { "epoch": 2.313864746058792, "grad_norm": 2.761948347091675, "learning_rate": 9.483313170098121e-05, "loss": 0.1535, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10100 }, { "epoch": 2.3253196637981643, "grad_norm": 3.1246402263641357, "learning_rate": 9.474426466439337e-05, "loss": 0.1579, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10150 }, { "epoch": 2.3367745815375365, "grad_norm": 3.328728437423706, "learning_rate": 9.465468227046876e-05, "loss": 0.1567, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10200 }, { "epoch": 2.348229499276908, "grad_norm": 4.195374965667725, "learning_rate": 9.456438595142272e-05, "loss": 0.1542, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10250 }, { "epoch": 2.3596844170162803, "grad_norm": 3.6173229217529297, "learning_rate": 9.447337715088461e-05, "loss": 0.1615, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10300 }, { "epoch": 2.3711393347556524, "grad_norm": 3.0115489959716797, "learning_rate": 9.438165732387472e-05, "loss": 0.1586, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10350 }, { "epoch": 2.382594252495024, "grad_norm": 4.064676284790039, "learning_rate": 9.428922793678101e-05, "loss": 0.1551, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10400 }, { "epoch": 2.3940491702343962, "grad_norm": 3.5949292182922363, "learning_rate": 9.419609046733571e-05, "loss": 0.1502, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10450 }, { "epoch": 2.4055040879737684, "grad_norm": 3.932413101196289, "learning_rate": 9.410224640459156e-05, "loss": 0.157, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10500 }, { "epoch": 2.41695900571314, "grad_norm": 3.8124208450317383, "learning_rate": 9.400769724889817e-05, "loss": 0.1495, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10550 }, { "epoch": 2.428413923452512, "grad_norm": 3.310115098953247, "learning_rate": 9.391244451187793e-05, "loss": 0.1572, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10600 }, { "epoch": 2.4398688411918843, "grad_norm": 3.140340566635132, "learning_rate": 9.381648971640184e-05, "loss": 0.1544, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10650 }, { "epoch": 2.451323758931256, "grad_norm": 3.2607996463775635, "learning_rate": 9.371983439656524e-05, "loss": 0.1515, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10700 }, { "epoch": 2.462778676670628, "grad_norm": 3.3957531452178955, "learning_rate": 9.362248009766321e-05, "loss": 0.1506, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10750 }, { "epoch": 2.4742335944100002, "grad_norm": 3.6932249069213867, "learning_rate": 9.35244283761659e-05, "loss": 0.1417, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10800 }, { "epoch": 2.4856885121493724, "grad_norm": 2.407801389694214, "learning_rate": 9.342568079969363e-05, "loss": 0.1507, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10850 }, { "epoch": 2.497143429888744, "grad_norm": 3.5010054111480713, "learning_rate": 9.33262389469918e-05, "loss": 0.1486, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10900 }, { "epoch": 2.508598347628116, "grad_norm": 3.2884604930877686, "learning_rate": 9.322610440790572e-05, "loss": 0.1545, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 10950 }, { "epoch": 2.520053265367488, "grad_norm": 3.1958744525909424, "learning_rate": 9.312527878335518e-05, "loss": 0.1431, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11000 }, { "epoch": 2.53150818310686, "grad_norm": 3.1914916038513184, "learning_rate": 9.302376368530874e-05, "loss": 0.147, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11050 }, { "epoch": 2.542963100846232, "grad_norm": 2.7763078212738037, "learning_rate": 9.292156073675815e-05, "loss": 0.1471, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11100 }, { "epoch": 2.5544180185856042, "grad_norm": 3.8447723388671875, "learning_rate": 9.281867157169221e-05, "loss": 0.1463, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11150 }, { "epoch": 2.565872936324976, "grad_norm": 3.5225303173065186, "learning_rate": 9.27150978350708e-05, "loss": 0.1462, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11200 }, { "epoch": 2.577327854064348, "grad_norm": 3.2575135231018066, "learning_rate": 9.261084118279847e-05, "loss": 0.139, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11250 }, { "epoch": 2.5887827718037197, "grad_norm": 3.11187481880188, "learning_rate": 9.250590328169807e-05, "loss": 0.1423, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11300 }, { "epoch": 2.600237689543092, "grad_norm": 3.156135082244873, "learning_rate": 9.240028580948395e-05, "loss": 0.1426, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11350 }, { "epoch": 2.611692607282464, "grad_norm": 3.4446299076080322, "learning_rate": 9.229399045473532e-05, "loss": 0.1459, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11400 }, { "epoch": 2.623147525021836, "grad_norm": 3.1665008068084717, "learning_rate": 9.218701891686916e-05, "loss": 0.1489, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11450 }, { "epoch": 2.634602442761208, "grad_norm": 2.7036280632019043, "learning_rate": 9.207937290611298e-05, "loss": 0.1407, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11500 }, { "epoch": 2.64605736050058, "grad_norm": 3.9781899452209473, "learning_rate": 9.197105414347762e-05, "loss": 0.1476, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11550 }, { "epoch": 2.6575122782399516, "grad_norm": 2.9390923976898193, "learning_rate": 9.186206436072965e-05, "loss": 0.1369, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11600 }, { "epoch": 2.6689671959793237, "grad_norm": 1.9289586544036865, "learning_rate": 9.175240530036369e-05, "loss": 0.1363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11650 }, { "epoch": 2.680422113718696, "grad_norm": 3.644439697265625, "learning_rate": 9.164207871557456e-05, "loss": 0.1415, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11700 }, { "epoch": 2.691877031458068, "grad_norm": 3.1818296909332275, "learning_rate": 9.153108637022928e-05, "loss": 0.1371, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11750 }, { "epoch": 2.7033319491974397, "grad_norm": 2.6996982097625732, "learning_rate": 9.14194300388388e-05, "loss": 0.1409, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11800 }, { "epoch": 2.714786866936812, "grad_norm": 3.8771860599517822, "learning_rate": 9.13071115065297e-05, "loss": 0.1395, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11850 }, { "epoch": 2.726241784676184, "grad_norm": 3.087873935699463, "learning_rate": 9.119413256901563e-05, "loss": 0.1374, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11900 }, { "epoch": 2.7376967024155556, "grad_norm": 3.33695650100708, "learning_rate": 9.108049503256854e-05, "loss": 0.1378, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 11950 }, { "epoch": 2.7491516201549278, "grad_norm": 3.057760715484619, "learning_rate": 9.096620071398994e-05, "loss": 0.1417, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12000 }, { "epoch": 2.7606065378943, "grad_norm": 4.001928329467773, "learning_rate": 9.085125144058168e-05, "loss": 0.1405, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12050 }, { "epoch": 2.7720614556336716, "grad_norm": 2.8355178833007812, "learning_rate": 9.073564905011689e-05, "loss": 0.1426, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12100 }, { "epoch": 2.7835163733730437, "grad_norm": 3.0020503997802734, "learning_rate": 9.061939539081049e-05, "loss": 0.1386, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12150 }, { "epoch": 2.794971291112416, "grad_norm": 4.463298797607422, "learning_rate": 9.05024923212897e-05, "loss": 0.1368, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12200 }, { "epoch": 2.8064262088517875, "grad_norm": 3.095207929611206, "learning_rate": 9.03849417105643e-05, "loss": 0.139, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12250 }, { "epoch": 2.8178811265911596, "grad_norm": 3.377472162246704, "learning_rate": 9.026674543799676e-05, "loss": 0.1356, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12300 }, { "epoch": 2.8293360443305318, "grad_norm": 3.876528739929199, "learning_rate": 9.01479053932722e-05, "loss": 0.1356, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12350 }, { "epoch": 2.840790962069904, "grad_norm": 2.9100306034088135, "learning_rate": 9.002842347636815e-05, "loss": 0.1353, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12400 }, { "epoch": 2.8522458798092756, "grad_norm": 2.7643377780914307, "learning_rate": 8.990830159752422e-05, "loss": 0.1338, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12450 }, { "epoch": 2.8637007975486477, "grad_norm": 2.872948169708252, "learning_rate": 8.978754167721151e-05, "loss": 0.1352, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12500 }, { "epoch": 2.8751557152880194, "grad_norm": 3.3348748683929443, "learning_rate": 8.96661456461019e-05, "loss": 0.1337, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12550 }, { "epoch": 2.8866106330273915, "grad_norm": 2.863382577896118, "learning_rate": 8.954411544503729e-05, "loss": 0.1291, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12600 }, { "epoch": 2.8980655507667636, "grad_norm": 3.632277250289917, "learning_rate": 8.94214530249984e-05, "loss": 0.1325, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12650 }, { "epoch": 2.9095204685061358, "grad_norm": 3.788857936859131, "learning_rate": 8.929816034707375e-05, "loss": 0.1331, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12700 }, { "epoch": 2.9209753862455075, "grad_norm": 2.73443865776062, "learning_rate": 8.917423938242814e-05, "loss": 0.1322, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12750 }, { "epoch": 2.9324303039848796, "grad_norm": 3.1101582050323486, "learning_rate": 8.904969211227134e-05, "loss": 0.1274, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12800 }, { "epoch": 2.9438852217242513, "grad_norm": 2.1412153244018555, "learning_rate": 8.892452052782616e-05, "loss": 0.1363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12850 }, { "epoch": 2.9553401394636234, "grad_norm": 2.4939417839050293, "learning_rate": 8.879872663029689e-05, "loss": 0.1317, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12900 }, { "epoch": 2.9667950572029955, "grad_norm": 2.754542589187622, "learning_rate": 8.867231243083703e-05, "loss": 0.1257, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 12950 }, { "epoch": 2.9782499749423677, "grad_norm": 2.955983877182007, "learning_rate": 8.854527995051738e-05, "loss": 0.1289, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13000 }, { "epoch": 2.9897048926817393, "grad_norm": 3.313758373260498, "learning_rate": 8.841763122029358e-05, "loss": 0.1308, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13050 }, { "epoch": 3.0011454917739373, "grad_norm": 1.7117892503738403, "learning_rate": 8.828936828097368e-05, "loss": 0.1221, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13100 }, { "epoch": 3.012600409513309, "grad_norm": 3.7318451404571533, "learning_rate": 8.816049318318552e-05, "loss": 0.0704, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13150 }, { "epoch": 3.024055327252681, "grad_norm": 2.1490225791931152, "learning_rate": 8.803100798734391e-05, "loss": 0.0698, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13200 }, { "epoch": 3.0355102449920532, "grad_norm": 2.4357903003692627, "learning_rate": 8.790091476361777e-05, "loss": 0.0717, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13250 }, { "epoch": 3.046965162731425, "grad_norm": 3.2305984497070312, "learning_rate": 8.777021559189695e-05, "loss": 0.0673, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13300 }, { "epoch": 3.058420080470797, "grad_norm": 2.8263580799102783, "learning_rate": 8.763891256175902e-05, "loss": 0.069, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13350 }, { "epoch": 3.069874998210169, "grad_norm": 3.3232004642486572, "learning_rate": 8.750700777243583e-05, "loss": 0.0723, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13400 }, { "epoch": 3.0813299159495413, "grad_norm": 2.5803654193878174, "learning_rate": 8.737450333277996e-05, "loss": 0.068, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13450 }, { "epoch": 3.092784833688913, "grad_norm": 3.2602574825286865, "learning_rate": 8.724140136123106e-05, "loss": 0.0682, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13500 }, { "epoch": 3.104239751428285, "grad_norm": 3.49511456489563, "learning_rate": 8.710770398578189e-05, "loss": 0.0744, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13550 }, { "epoch": 3.1156946691676572, "grad_norm": 3.492642879486084, "learning_rate": 8.697341334394435e-05, "loss": 0.0678, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13600 }, { "epoch": 3.127149586907029, "grad_norm": 2.680922269821167, "learning_rate": 8.683853158271532e-05, "loss": 0.0682, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13650 }, { "epoch": 3.138604504646401, "grad_norm": 2.501112699508667, "learning_rate": 8.670306085854229e-05, "loss": 0.0727, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13700 }, { "epoch": 3.150059422385773, "grad_norm": 1.7489196062088013, "learning_rate": 8.65670033372889e-05, "loss": 0.0706, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13750 }, { "epoch": 3.161514340125145, "grad_norm": 2.4260241985321045, "learning_rate": 8.643036119420033e-05, "loss": 0.0718, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13800 }, { "epoch": 3.172969257864517, "grad_norm": 3.021453380584717, "learning_rate": 8.629313661386856e-05, "loss": 0.0723, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13850 }, { "epoch": 3.184424175603889, "grad_norm": 2.5771586894989014, "learning_rate": 8.615533179019726e-05, "loss": 0.0712, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13900 }, { "epoch": 3.195879093343261, "grad_norm": 3.019286870956421, "learning_rate": 8.6016948926367e-05, "loss": 0.0705, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 13950 }, { "epoch": 3.207334011082633, "grad_norm": 2.4302775859832764, "learning_rate": 8.587799023479982e-05, "loss": 0.071, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14000 }, { "epoch": 3.218788928822005, "grad_norm": 1.8431477546691895, "learning_rate": 8.573845793712383e-05, "loss": 0.0727, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14050 }, { "epoch": 3.2302438465613768, "grad_norm": 2.839580774307251, "learning_rate": 8.559835426413794e-05, "loss": 0.0739, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14100 }, { "epoch": 3.241698764300749, "grad_norm": 3.9472312927246094, "learning_rate": 8.545768145577589e-05, "loss": 0.0689, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14150 }, { "epoch": 3.253153682040121, "grad_norm": 2.908961296081543, "learning_rate": 8.531644176107066e-05, "loss": 0.0701, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14200 }, { "epoch": 3.2646085997794927, "grad_norm": 1.9942492246627808, "learning_rate": 8.517463743811836e-05, "loss": 0.0708, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14250 }, { "epoch": 3.276063517518865, "grad_norm": 2.883118152618408, "learning_rate": 8.503227075404227e-05, "loss": 0.0751, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14300 }, { "epoch": 3.287518435258237, "grad_norm": 2.3924851417541504, "learning_rate": 8.488934398495649e-05, "loss": 0.0725, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14350 }, { "epoch": 3.2989733529976086, "grad_norm": 2.108149766921997, "learning_rate": 8.474585941592959e-05, "loss": 0.0754, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14400 }, { "epoch": 3.3104282707369808, "grad_norm": 1.8208028078079224, "learning_rate": 8.460181934094809e-05, "loss": 0.0713, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14450 }, { "epoch": 3.321883188476353, "grad_norm": 2.987584114074707, "learning_rate": 8.445722606287971e-05, "loss": 0.0727, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14500 }, { "epoch": 3.3333381062157246, "grad_norm": 3.576843023300171, "learning_rate": 8.43120818934367e-05, "loss": 0.0692, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14550 }, { "epoch": 3.3447930239550967, "grad_norm": 1.5616097450256348, "learning_rate": 8.416638915313868e-05, "loss": 0.071, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14600 }, { "epoch": 3.356247941694469, "grad_norm": 2.461344003677368, "learning_rate": 8.402015017127571e-05, "loss": 0.0728, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14650 }, { "epoch": 3.3677028594338405, "grad_norm": 2.740246534347534, "learning_rate": 8.387336728587103e-05, "loss": 0.0738, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14700 }, { "epoch": 3.3791577771732126, "grad_norm": 2.1253201961517334, "learning_rate": 8.372604284364355e-05, "loss": 0.0721, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14750 }, { "epoch": 3.3906126949125848, "grad_norm": 2.5474374294281006, "learning_rate": 8.357817919997049e-05, "loss": 0.0701, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14800 }, { "epoch": 3.4020676126519565, "grad_norm": 1.9206650257110596, "learning_rate": 8.34297787188496e-05, "loss": 0.0721, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14850 }, { "epoch": 3.4135225303913286, "grad_norm": 2.298408031463623, "learning_rate": 8.328084377286149e-05, "loss": 0.0719, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14900 }, { "epoch": 3.4249774481307007, "grad_norm": 2.9477977752685547, "learning_rate": 8.313137674313158e-05, "loss": 0.0724, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 14950 }, { "epoch": 3.436432365870073, "grad_norm": 2.4904532432556152, "learning_rate": 8.298138001929206e-05, "loss": 0.0726, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15000 }, { "epoch": 3.4478872836094445, "grad_norm": 2.2400805950164795, "learning_rate": 8.283085599944376e-05, "loss": 0.0713, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15050 }, { "epoch": 3.4593422013488166, "grad_norm": 2.3121421337127686, "learning_rate": 8.267980709011769e-05, "loss": 0.0668, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15100 }, { "epoch": 3.4707971190881883, "grad_norm": 2.701951026916504, "learning_rate": 8.25282357062367e-05, "loss": 0.0698, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15150 }, { "epoch": 3.4822520368275605, "grad_norm": 2.5985162258148193, "learning_rate": 8.237614427107672e-05, "loss": 0.0682, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15200 }, { "epoch": 3.4937069545669326, "grad_norm": 1.998067855834961, "learning_rate": 8.222353521622819e-05, "loss": 0.0716, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15250 }, { "epoch": 3.5051618723063047, "grad_norm": 2.705017328262329, "learning_rate": 8.2070410981557e-05, "loss": 0.0687, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15300 }, { "epoch": 3.5166167900456764, "grad_norm": 2.35690975189209, "learning_rate": 8.191677401516565e-05, "loss": 0.0693, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15350 }, { "epoch": 3.5280717077850485, "grad_norm": 2.5952446460723877, "learning_rate": 8.176262677335398e-05, "loss": 0.0712, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15400 }, { "epoch": 3.53952662552442, "grad_norm": 2.347503662109375, "learning_rate": 8.160797172057998e-05, "loss": 0.0724, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15450 }, { "epoch": 3.5509815432637923, "grad_norm": 2.6107993125915527, "learning_rate": 8.145281132942037e-05, "loss": 0.069, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15500 }, { "epoch": 3.5624364610031645, "grad_norm": 2.2941091060638428, "learning_rate": 8.129714808053106e-05, "loss": 0.069, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15550 }, { "epoch": 3.5738913787425366, "grad_norm": 3.4392402172088623, "learning_rate": 8.114098446260745e-05, "loss": 0.072, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15600 }, { "epoch": 3.5853462964819083, "grad_norm": 1.876505732536316, "learning_rate": 8.098432297234473e-05, "loss": 0.0694, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15650 }, { "epoch": 3.5968012142212804, "grad_norm": 1.9874284267425537, "learning_rate": 8.082716611439793e-05, "loss": 0.0685, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15700 }, { "epoch": 3.608256131960652, "grad_norm": 2.479461669921875, "learning_rate": 8.066951640134181e-05, "loss": 0.0696, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15750 }, { "epoch": 3.619711049700024, "grad_norm": 2.318502426147461, "learning_rate": 8.051137635363078e-05, "loss": 0.0712, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15800 }, { "epoch": 3.6311659674393963, "grad_norm": 2.2743539810180664, "learning_rate": 8.035274849955858e-05, "loss": 0.066, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15850 }, { "epoch": 3.6426208851787685, "grad_norm": 2.7927591800689697, "learning_rate": 8.019363537521781e-05, "loss": 0.0722, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15900 }, { "epoch": 3.65407580291814, "grad_norm": 2.3082404136657715, "learning_rate": 8.003403952445942e-05, "loss": 0.0727, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 15950 }, { "epoch": 3.6655307206575123, "grad_norm": 1.7190062999725342, "learning_rate": 7.987396349885207e-05, "loss": 0.0688, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16000 }, { "epoch": 3.6769856383968844, "grad_norm": 2.170894145965576, "learning_rate": 7.97134098576413e-05, "loss": 0.0643, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16050 }, { "epoch": 3.688440556136256, "grad_norm": 2.3685245513916016, "learning_rate": 7.955238116770859e-05, "loss": 0.0667, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16100 }, { "epoch": 3.6998954738756282, "grad_norm": 2.269733190536499, "learning_rate": 7.939088000353038e-05, "loss": 0.0653, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16150 }, { "epoch": 3.7113503916150004, "grad_norm": 2.966156005859375, "learning_rate": 7.922890894713688e-05, "loss": 0.0641, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16200 }, { "epoch": 3.722805309354372, "grad_norm": 2.5244526863098145, "learning_rate": 7.906647058807078e-05, "loss": 0.0673, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16250 }, { "epoch": 3.734260227093744, "grad_norm": 2.3612561225891113, "learning_rate": 7.890356752334585e-05, "loss": 0.0682, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16300 }, { "epoch": 3.7457151448331163, "grad_norm": 2.6866989135742188, "learning_rate": 7.874020235740544e-05, "loss": 0.0689, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16350 }, { "epoch": 3.757170062572488, "grad_norm": 2.266900062561035, "learning_rate": 7.857637770208084e-05, "loss": 0.0698, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16400 }, { "epoch": 3.76862498031186, "grad_norm": 2.235653877258301, "learning_rate": 7.841209617654949e-05, "loss": 0.0642, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16450 }, { "epoch": 3.7800798980512322, "grad_norm": 4.613194942474365, "learning_rate": 7.824736040729315e-05, "loss": 0.0646, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16500 }, { "epoch": 3.7915348157906044, "grad_norm": 1.9603101015090942, "learning_rate": 7.808217302805587e-05, "loss": 0.0686, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16550 }, { "epoch": 3.802989733529976, "grad_norm": 2.1632003784179688, "learning_rate": 7.791653667980191e-05, "loss": 0.0663, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16600 }, { "epoch": 3.814444651269348, "grad_norm": 2.5433571338653564, "learning_rate": 7.77504540106735e-05, "loss": 0.0664, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16650 }, { "epoch": 3.82589956900872, "grad_norm": 3.197382926940918, "learning_rate": 7.758392767594853e-05, "loss": 0.0679, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16700 }, { "epoch": 3.837354486748092, "grad_norm": 2.555476188659668, "learning_rate": 7.741696033799804e-05, "loss": 0.0681, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16750 }, { "epoch": 3.848809404487464, "grad_norm": 2.589463233947754, "learning_rate": 7.724955466624371e-05, "loss": 0.0677, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16800 }, { "epoch": 3.8602643222268362, "grad_norm": 2.2410428524017334, "learning_rate": 7.708171333711517e-05, "loss": 0.0688, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16850 }, { "epoch": 3.871719239966208, "grad_norm": 2.9268081188201904, "learning_rate": 7.69134390340072e-05, "loss": 0.0674, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16900 }, { "epoch": 3.88317415770558, "grad_norm": 2.1275105476379395, "learning_rate": 7.674473444723684e-05, "loss": 0.0677, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 16950 }, { "epoch": 3.8946290754449517, "grad_norm": 1.7868996858596802, "learning_rate": 7.657560227400037e-05, "loss": 0.0667, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17000 }, { "epoch": 3.906083993184324, "grad_norm": 2.705197811126709, "learning_rate": 7.640604521833015e-05, "loss": 0.0713, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17050 }, { "epoch": 3.917538910923696, "grad_norm": 1.5226702690124512, "learning_rate": 7.62360659910515e-05, "loss": 0.067, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17100 }, { "epoch": 3.928993828663068, "grad_norm": 2.7335004806518555, "learning_rate": 7.60656673097392e-05, "loss": 0.0653, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17150 }, { "epoch": 3.94044874640244, "grad_norm": 2.0359129905700684, "learning_rate": 7.589485189867422e-05, "loss": 0.067, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17200 }, { "epoch": 3.951903664141812, "grad_norm": 2.2404749393463135, "learning_rate": 7.572362248880001e-05, "loss": 0.0659, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17250 }, { "epoch": 3.9633585818811836, "grad_norm": 1.9133015871047974, "learning_rate": 7.555198181767894e-05, "loss": 0.0662, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17300 }, { "epoch": 3.9748134996205557, "grad_norm": 3.204033136367798, "learning_rate": 7.537993262944849e-05, "loss": 0.0644, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17350 }, { "epoch": 3.986268417359928, "grad_norm": 2.0416345596313477, "learning_rate": 7.520747767477734e-05, "loss": 0.0648, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17400 }, { "epoch": 3.9977233350993, "grad_norm": 2.1592066287994385, "learning_rate": 7.50346197108215e-05, "loss": 0.0629, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17450 }, { "epoch": 4.0091639341914975, "grad_norm": 2.386658191680908, "learning_rate": 7.486136150118015e-05, "loss": 0.0421, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17500 }, { "epoch": 4.02061885193087, "grad_norm": 1.3900179862976074, "learning_rate": 7.468770581585146e-05, "loss": 0.0324, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17550 }, { "epoch": 4.032073769670242, "grad_norm": 1.8588780164718628, "learning_rate": 7.451365543118831e-05, "loss": 0.0354, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17600 }, { "epoch": 4.043528687409614, "grad_norm": 1.4627822637557983, "learning_rate": 7.433921312985393e-05, "loss": 0.0328, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17650 }, { "epoch": 4.054983605148985, "grad_norm": 2.9422807693481445, "learning_rate": 7.416438170077738e-05, "loss": 0.0349, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17700 }, { "epoch": 4.066438522888357, "grad_norm": 1.9216961860656738, "learning_rate": 7.398916393910895e-05, "loss": 0.0364, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17750 }, { "epoch": 4.077893440627729, "grad_norm": 1.9999079704284668, "learning_rate": 7.381356264617557e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17800 }, { "epoch": 4.0893483583671015, "grad_norm": 1.1669881343841553, "learning_rate": 7.363758062943587e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17850 }, { "epoch": 4.100803276106474, "grad_norm": 1.4963182210922241, "learning_rate": 7.346122070243539e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17900 }, { "epoch": 4.112258193845846, "grad_norm": 2.435983419418335, "learning_rate": 7.328448568476163e-05, "loss": 0.0353, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 17950 }, { "epoch": 4.123713111585217, "grad_norm": 1.783022403717041, "learning_rate": 7.310737840199885e-05, "loss": 0.0343, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18000 }, { "epoch": 4.135168029324589, "grad_norm": 1.7959028482437134, "learning_rate": 7.292990168568302e-05, "loss": 0.0344, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18050 }, { "epoch": 4.146622947063961, "grad_norm": 1.0920823812484741, "learning_rate": 7.275205837325649e-05, "loss": 0.0352, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18100 }, { "epoch": 4.158077864803333, "grad_norm": 2.1539368629455566, "learning_rate": 7.257385130802261e-05, "loss": 0.0362, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18150 }, { "epoch": 4.1695327825427055, "grad_norm": 2.0688672065734863, "learning_rate": 7.239528333910031e-05, "loss": 0.0358, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18200 }, { "epoch": 4.180987700282078, "grad_norm": 2.0575592517852783, "learning_rate": 7.221635732137854e-05, "loss": 0.037, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18250 }, { "epoch": 4.192442618021449, "grad_norm": 2.307478189468384, "learning_rate": 7.203707611547066e-05, "loss": 0.0383, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18300 }, { "epoch": 4.203897535760821, "grad_norm": 1.4493507146835327, "learning_rate": 7.185744258766858e-05, "loss": 0.0368, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18350 }, { "epoch": 4.215352453500193, "grad_norm": 1.858702301979065, "learning_rate": 7.167745960989708e-05, "loss": 0.0371, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18400 }, { "epoch": 4.226807371239565, "grad_norm": 2.091564893722534, "learning_rate": 7.149713005966784e-05, "loss": 0.037, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18450 }, { "epoch": 4.238262288978937, "grad_norm": 1.320420503616333, "learning_rate": 7.13164568200334e-05, "loss": 0.0395, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18500 }, { "epoch": 4.2497172067183095, "grad_norm": 1.7669836282730103, "learning_rate": 7.113544277954116e-05, "loss": 0.036, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18550 }, { "epoch": 4.261172124457682, "grad_norm": 1.7692891359329224, "learning_rate": 7.095409083218705e-05, "loss": 0.0363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18600 }, { "epoch": 4.272627042197053, "grad_norm": 1.4716825485229492, "learning_rate": 7.077240387736943e-05, "loss": 0.0387, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18650 }, { "epoch": 4.284081959936425, "grad_norm": 1.9312763214111328, "learning_rate": 7.05903848198426e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18700 }, { "epoch": 4.295536877675797, "grad_norm": 1.417018175125122, "learning_rate": 7.040803656967045e-05, "loss": 0.0364, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18750 }, { "epoch": 4.306991795415169, "grad_norm": 2.400550365447998, "learning_rate": 7.022536204217989e-05, "loss": 0.0363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18800 }, { "epoch": 4.318446713154541, "grad_norm": 1.612289547920227, "learning_rate": 7.004236415791421e-05, "loss": 0.0371, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18850 }, { "epoch": 4.3299016308939136, "grad_norm": 2.4686686992645264, "learning_rate": 6.985904584258649e-05, "loss": 0.0401, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18900 }, { "epoch": 4.341356548633285, "grad_norm": 3.242429256439209, "learning_rate": 6.967541002703274e-05, "loss": 0.0353, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 18950 }, { "epoch": 4.352811466372657, "grad_norm": 2.2859609127044678, "learning_rate": 6.949145964716505e-05, "loss": 0.0365, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19000 }, { "epoch": 4.364266384112029, "grad_norm": 2.1360576152801514, "learning_rate": 6.930719764392466e-05, "loss": 0.0382, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19050 }, { "epoch": 4.375721301851401, "grad_norm": 1.6462370157241821, "learning_rate": 6.912262696323497e-05, "loss": 0.0358, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19100 }, { "epoch": 4.387176219590773, "grad_norm": 1.5075321197509766, "learning_rate": 6.893775055595442e-05, "loss": 0.0356, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19150 }, { "epoch": 4.398631137330145, "grad_norm": 1.614206075668335, "learning_rate": 6.87525713778293e-05, "loss": 0.0392, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19200 }, { "epoch": 4.410086055069517, "grad_norm": 1.9505984783172607, "learning_rate": 6.856709238944649e-05, "loss": 0.0354, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19250 }, { "epoch": 4.421540972808889, "grad_norm": 1.831098198890686, "learning_rate": 6.838131655618618e-05, "loss": 0.0355, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19300 }, { "epoch": 4.432995890548261, "grad_norm": 2.2867400646209717, "learning_rate": 6.819524684817438e-05, "loss": 0.037, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19350 }, { "epoch": 4.444450808287633, "grad_norm": 1.2839210033416748, "learning_rate": 6.800888624023553e-05, "loss": 0.0375, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19400 }, { "epoch": 4.455905726027005, "grad_norm": 1.812117099761963, "learning_rate": 6.782223771184484e-05, "loss": 0.0365, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19450 }, { "epoch": 4.467360643766377, "grad_norm": 1.3475086688995361, "learning_rate": 6.763530424708072e-05, "loss": 0.0356, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19500 }, { "epoch": 4.4788155615057486, "grad_norm": 1.6308741569519043, "learning_rate": 6.744808883457707e-05, "loss": 0.0367, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19550 }, { "epoch": 4.490270479245121, "grad_norm": 1.424625039100647, "learning_rate": 6.726059446747545e-05, "loss": 0.0384, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19600 }, { "epoch": 4.501725396984493, "grad_norm": 2.242457389831543, "learning_rate": 6.707282414337728e-05, "loss": 0.0352, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19650 }, { "epoch": 4.513180314723865, "grad_norm": 2.116205930709839, "learning_rate": 6.688478086429589e-05, "loss": 0.0374, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19700 }, { "epoch": 4.524635232463237, "grad_norm": 1.493812084197998, "learning_rate": 6.669646763660855e-05, "loss": 0.0339, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19750 }, { "epoch": 4.536090150202609, "grad_norm": 1.5812180042266846, "learning_rate": 6.650788747100832e-05, "loss": 0.0375, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19800 }, { "epoch": 4.54754506794198, "grad_norm": 1.9899191856384277, "learning_rate": 6.631904338245607e-05, "loss": 0.0373, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19850 }, { "epoch": 4.558999985681353, "grad_norm": 1.682928442955017, "learning_rate": 6.612993839013211e-05, "loss": 0.0363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19900 }, { "epoch": 4.570454903420725, "grad_norm": 1.5727615356445312, "learning_rate": 6.594057551738803e-05, "loss": 0.0368, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 19950 }, { "epoch": 4.581909821160097, "grad_norm": 1.2249151468276978, "learning_rate": 6.575095779169836e-05, "loss": 0.0374, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20000 }, { "epoch": 4.593364738899469, "grad_norm": 1.8625729084014893, "learning_rate": 6.556108824461206e-05, "loss": 0.0356, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20050 }, { "epoch": 4.604819656638841, "grad_norm": 1.3668529987335205, "learning_rate": 6.537096991170423e-05, "loss": 0.0331, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20100 }, { "epoch": 4.616274574378213, "grad_norm": 1.388374924659729, "learning_rate": 6.518060583252741e-05, "loss": 0.0355, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20150 }, { "epoch": 4.627729492117584, "grad_norm": 2.348038673400879, "learning_rate": 6.498999905056309e-05, "loss": 0.0369, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20200 }, { "epoch": 4.639184409856957, "grad_norm": 1.701794147491455, "learning_rate": 6.479915261317298e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20250 }, { "epoch": 4.650639327596329, "grad_norm": 1.3405938148498535, "learning_rate": 6.460806957155037e-05, "loss": 0.0355, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20300 }, { "epoch": 4.662094245335701, "grad_norm": 1.725538730621338, "learning_rate": 6.441675298067128e-05, "loss": 0.0348, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20350 }, { "epoch": 4.673549163075073, "grad_norm": 1.583162784576416, "learning_rate": 6.422520589924564e-05, "loss": 0.0344, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20400 }, { "epoch": 4.685004080814444, "grad_norm": 1.4338629245758057, "learning_rate": 6.403343138966841e-05, "loss": 0.0353, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20450 }, { "epoch": 4.696458998553816, "grad_norm": 2.6246755123138428, "learning_rate": 6.384143251797056e-05, "loss": 0.0363, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20500 }, { "epoch": 4.7079139162931884, "grad_norm": 2.1834542751312256, "learning_rate": 6.364921235377016e-05, "loss": 0.0343, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20550 }, { "epoch": 4.719368834032561, "grad_norm": 1.6738187074661255, "learning_rate": 6.345677397022315e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20600 }, { "epoch": 4.730823751771933, "grad_norm": 1.6495721340179443, "learning_rate": 6.326412044397438e-05, "loss": 0.0366, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20650 }, { "epoch": 4.742278669511305, "grad_norm": 1.7878650426864624, "learning_rate": 6.307125485510828e-05, "loss": 0.0338, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20700 }, { "epoch": 4.753733587250677, "grad_norm": 2.035374641418457, "learning_rate": 6.287818028709967e-05, "loss": 0.0371, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20750 }, { "epoch": 4.765188504990048, "grad_norm": 2.248223304748535, "learning_rate": 6.268489982676446e-05, "loss": 0.0374, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20800 }, { "epoch": 4.77664342272942, "grad_norm": 2.056480646133423, "learning_rate": 6.249141656421035e-05, "loss": 0.0353, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20850 }, { "epoch": 4.7880983404687925, "grad_norm": 1.4961349964141846, "learning_rate": 6.229773359278735e-05, "loss": 0.037, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20900 }, { "epoch": 4.799553258208165, "grad_norm": 1.207465410232544, "learning_rate": 6.210385400903836e-05, "loss": 0.0344, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 20950 }, { "epoch": 4.811008175947537, "grad_norm": 1.933811902999878, "learning_rate": 6.190978091264959e-05, "loss": 0.0338, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21000 }, { "epoch": 4.822463093686909, "grad_norm": 1.5286064147949219, "learning_rate": 6.171551740640115e-05, "loss": 0.033, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21050 }, { "epoch": 4.83391801142628, "grad_norm": 1.4746378660202026, "learning_rate": 6.152106659611736e-05, "loss": 0.035, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21100 }, { "epoch": 4.845372929165652, "grad_norm": 1.964225172996521, "learning_rate": 6.132643159061707e-05, "loss": 0.0336, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21150 }, { "epoch": 4.856827846905024, "grad_norm": 1.239408016204834, "learning_rate": 6.1131615501664e-05, "loss": 0.0321, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21200 }, { "epoch": 4.8682827646443965, "grad_norm": 2.219224452972412, "learning_rate": 6.093662144391695e-05, "loss": 0.0371, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21250 }, { "epoch": 4.879737682383769, "grad_norm": 1.2696152925491333, "learning_rate": 6.074145253488006e-05, "loss": 0.0338, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21300 }, { "epoch": 4.891192600123141, "grad_norm": 0.5583789944648743, "learning_rate": 6.054611189485293e-05, "loss": 0.0351, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21350 }, { "epoch": 4.902647517862512, "grad_norm": 1.4981776475906372, "learning_rate": 6.035060264688075e-05, "loss": 0.0321, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21400 }, { "epoch": 4.914102435601884, "grad_norm": 1.6405904293060303, "learning_rate": 6.0154927916704304e-05, "loss": 0.0339, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21450 }, { "epoch": 4.925557353341256, "grad_norm": 1.264320731163025, "learning_rate": 5.9959090832710155e-05, "loss": 0.0319, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21500 }, { "epoch": 4.937012271080628, "grad_norm": 2.039963722229004, "learning_rate": 5.9763094525880426e-05, "loss": 0.0344, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21550 }, { "epoch": 4.9484671888200005, "grad_norm": 1.5706747770309448, "learning_rate": 5.956694212974292e-05, "loss": 0.0334, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21600 }, { "epoch": 4.959922106559373, "grad_norm": 2.058473587036133, "learning_rate": 5.937063678032093e-05, "loss": 0.0335, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21650 }, { "epoch": 4.971377024298745, "grad_norm": 1.5394372940063477, "learning_rate": 5.9174181616083066e-05, "loss": 0.0337, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21700 }, { "epoch": 4.982831942038116, "grad_norm": 2.087599992752075, "learning_rate": 5.89775797778932e-05, "loss": 0.0341, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21750 }, { "epoch": 4.994286859777488, "grad_norm": 1.8887306451797485, "learning_rate": 5.878083440896015e-05, "loss": 0.0327, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21800 }, { "epoch": 5.005727458869686, "grad_norm": 1.0491043329238892, "learning_rate": 5.858394865478745e-05, "loss": 0.0263, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21850 }, { "epoch": 5.017182376609058, "grad_norm": 1.3754074573516846, "learning_rate": 5.8386925663123104e-05, "loss": 0.0157, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21900 }, { "epoch": 5.02863729434843, "grad_norm": 0.8756074905395508, "learning_rate": 5.818976858390918e-05, "loss": 0.0184, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 21950 }, { "epoch": 5.040092212087802, "grad_norm": 0.9743272066116333, "learning_rate": 5.7992480569231514e-05, "loss": 0.018, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22000 }, { "epoch": 5.051547129827174, "grad_norm": 1.1601800918579102, "learning_rate": 5.779506477326933e-05, "loss": 0.0177, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22050 }, { "epoch": 5.063002047566546, "grad_norm": 1.3135687112808228, "learning_rate": 5.7597524352244734e-05, "loss": 0.0191, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22100 }, { "epoch": 5.0744569653059175, "grad_norm": 1.3936012983322144, "learning_rate": 5.7399862464372324e-05, "loss": 0.0184, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22150 }, { "epoch": 5.08591188304529, "grad_norm": 0.9932096600532532, "learning_rate": 5.720208226980864e-05, "loss": 0.0186, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22200 }, { "epoch": 5.097366800784662, "grad_norm": 1.0546112060546875, "learning_rate": 5.700418693060173e-05, "loss": 0.0194, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22250 }, { "epoch": 5.108821718524034, "grad_norm": 0.8949224948883057, "learning_rate": 5.6806179610640486e-05, "loss": 0.0187, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22300 }, { "epoch": 5.120276636263406, "grad_norm": 0.9786812663078308, "learning_rate": 5.660806347560416e-05, "loss": 0.0176, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22350 }, { "epoch": 5.131731554002778, "grad_norm": 0.9927299618721008, "learning_rate": 5.6409841692911625e-05, "loss": 0.0195, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22400 }, { "epoch": 5.143186471742149, "grad_norm": 1.1186367273330688, "learning_rate": 5.621151743167091e-05, "loss": 0.0189, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22450 }, { "epoch": 5.1546413894815215, "grad_norm": 1.2558495998382568, "learning_rate": 5.60130938626284e-05, "loss": 0.0195, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22500 }, { "epoch": 5.166096307220894, "grad_norm": 1.2264378070831299, "learning_rate": 5.581457415811815e-05, "loss": 0.0198, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22550 }, { "epoch": 5.177551224960266, "grad_norm": 1.243323564529419, "learning_rate": 5.561596149201127e-05, "loss": 0.0187, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22600 }, { "epoch": 5.189006142699638, "grad_norm": 0.641426682472229, "learning_rate": 5.541725903966504e-05, "loss": 0.0183, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22650 }, { "epoch": 5.20046106043901, "grad_norm": 0.5407239198684692, "learning_rate": 5.521846997787223e-05, "loss": 0.019, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22700 }, { "epoch": 5.211915978178382, "grad_norm": 1.1588449478149414, "learning_rate": 5.501959748481035e-05, "loss": 0.0203, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22750 }, { "epoch": 5.223370895917753, "grad_norm": 1.5353953838348389, "learning_rate": 5.482064473999071e-05, "loss": 0.0197, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22800 }, { "epoch": 5.2348258136571255, "grad_norm": 1.5715053081512451, "learning_rate": 5.462161492420772e-05, "loss": 0.0205, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22850 }, { "epoch": 5.246280731396498, "grad_norm": 0.8576170206069946, "learning_rate": 5.442251121948793e-05, "loss": 0.0198, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22900 }, { "epoch": 5.25773564913587, "grad_norm": 1.722284197807312, "learning_rate": 5.422333680903921e-05, "loss": 0.0194, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 22950 }, { "epoch": 5.269190566875242, "grad_norm": 1.3785938024520874, "learning_rate": 5.4024094877199884e-05, "loss": 0.0204, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23000 }, { "epoch": 5.280645484614614, "grad_norm": 0.8565208911895752, "learning_rate": 5.382478860938776e-05, "loss": 0.0187, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23050 }, { "epoch": 5.292100402353985, "grad_norm": 1.519986629486084, "learning_rate": 5.362542119204924e-05, "loss": 0.0204, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23100 }, { "epoch": 5.303555320093357, "grad_norm": 0.8362240791320801, "learning_rate": 5.3425995812608355e-05, "loss": 0.0188, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23150 }, { "epoch": 5.3150102378327295, "grad_norm": 1.1630821228027344, "learning_rate": 5.3226515659415824e-05, "loss": 0.0193, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23200 }, { "epoch": 5.326465155572102, "grad_norm": 0.8801319599151611, "learning_rate": 5.302698392169806e-05, "loss": 0.0179, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23250 }, { "epoch": 5.337920073311474, "grad_norm": 1.0547822713851929, "learning_rate": 5.2827403789506234e-05, "loss": 0.0203, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23300 }, { "epoch": 5.349374991050846, "grad_norm": 1.2050660848617554, "learning_rate": 5.262777845366515e-05, "loss": 0.0189, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23350 }, { "epoch": 5.360829908790217, "grad_norm": 1.1393359899520874, "learning_rate": 5.242811110572242e-05, "loss": 0.0199, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23400 }, { "epoch": 5.372284826529589, "grad_norm": 0.7587368488311768, "learning_rate": 5.2228404937897235e-05, "loss": 0.0182, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23450 }, { "epoch": 5.383739744268961, "grad_norm": 1.5062503814697266, "learning_rate": 5.20286631430295e-05, "loss": 0.0202, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23500 }, { "epoch": 5.3951946620083335, "grad_norm": 0.8290795087814331, "learning_rate": 5.1828888914528674e-05, "loss": 0.0197, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23550 }, { "epoch": 5.406649579747706, "grad_norm": 1.096450924873352, "learning_rate": 5.162908544632274e-05, "loss": 0.0194, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23600 }, { "epoch": 5.418104497487078, "grad_norm": 0.8339506387710571, "learning_rate": 5.142925593280722e-05, "loss": 0.0206, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23650 }, { "epoch": 5.429559415226449, "grad_norm": 1.8221694231033325, "learning_rate": 5.1229403568793963e-05, "loss": 0.02, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23700 }, { "epoch": 5.441014332965821, "grad_norm": 2.2157158851623535, "learning_rate": 5.1029531549460205e-05, "loss": 0.0208, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23750 }, { "epoch": 5.452469250705193, "grad_norm": 1.0183664560317993, "learning_rate": 5.0829643070297415e-05, "loss": 0.0192, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23800 }, { "epoch": 5.463924168444565, "grad_norm": 0.8894338011741638, "learning_rate": 5.062974132706016e-05, "loss": 0.0188, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23850 }, { "epoch": 5.4753790861839375, "grad_norm": 1.4340115785598755, "learning_rate": 5.042982951571515e-05, "loss": 0.0188, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23900 }, { "epoch": 5.48683400392331, "grad_norm": 1.343416690826416, "learning_rate": 5.022991083239002e-05, "loss": 0.0204, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 23950 }, { "epoch": 5.498288921662681, "grad_norm": 1.0022575855255127, "learning_rate": 5.0029988473322256e-05, "loss": 0.0196, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24000 }, { "epoch": 5.509743839402053, "grad_norm": 1.0293878316879272, "learning_rate": 4.9830065634808144e-05, "loss": 0.0185, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24050 }, { "epoch": 5.521198757141425, "grad_norm": 1.5076055526733398, "learning_rate": 4.963014551315163e-05, "loss": 0.018, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24100 }, { "epoch": 5.532653674880797, "grad_norm": 1.333103895187378, "learning_rate": 4.943023130461317e-05, "loss": 0.0189, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24150 }, { "epoch": 5.544108592620169, "grad_norm": 1.848940134048462, "learning_rate": 4.9230326205358794e-05, "loss": 0.0191, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24200 }, { "epoch": 5.5555635103595415, "grad_norm": 1.3931152820587158, "learning_rate": 4.903043341140879e-05, "loss": 0.0199, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24250 }, { "epoch": 5.567018428098914, "grad_norm": 1.2779439687728882, "learning_rate": 4.883055611858676e-05, "loss": 0.0181, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24300 }, { "epoch": 5.578473345838285, "grad_norm": 1.221993088722229, "learning_rate": 4.8630697522468455e-05, "loss": 0.0201, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24350 }, { "epoch": 5.589928263577657, "grad_norm": 1.5948313474655151, "learning_rate": 4.8430860818330756e-05, "loss": 0.0192, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24400 }, { "epoch": 5.601383181317029, "grad_norm": 1.4784343242645264, "learning_rate": 4.823104920110049e-05, "loss": 0.0195, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24450 }, { "epoch": 5.612838099056401, "grad_norm": 1.4098366498947144, "learning_rate": 4.8031265865303434e-05, "loss": 0.0201, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24500 }, { "epoch": 5.624293016795773, "grad_norm": 1.5338062047958374, "learning_rate": 4.783151400501319e-05, "loss": 0.0196, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24550 }, { "epoch": 5.635747934535145, "grad_norm": 1.1229875087738037, "learning_rate": 4.763179681380016e-05, "loss": 0.0188, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24600 }, { "epoch": 5.647202852274517, "grad_norm": 1.63667893409729, "learning_rate": 4.7432117484680434e-05, "loss": 0.02, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24650 }, { "epoch": 5.658657770013889, "grad_norm": 0.6071897745132446, "learning_rate": 4.723247921006483e-05, "loss": 0.0202, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24700 }, { "epoch": 5.670112687753261, "grad_norm": 1.3240978717803955, "learning_rate": 4.703288518170774e-05, "loss": 0.019, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24750 }, { "epoch": 5.681567605492633, "grad_norm": 0.9332528114318848, "learning_rate": 4.683333859065621e-05, "loss": 0.0189, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24800 }, { "epoch": 5.693022523232005, "grad_norm": 1.2496166229248047, "learning_rate": 4.663384262719881e-05, "loss": 0.0183, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24850 }, { "epoch": 5.704477440971377, "grad_norm": 1.2651309967041016, "learning_rate": 4.643440048081478e-05, "loss": 0.0201, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24900 }, { "epoch": 5.715932358710749, "grad_norm": 1.1634583473205566, "learning_rate": 4.623501534012287e-05, "loss": 0.02, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 24950 }, { "epoch": 5.727387276450121, "grad_norm": 1.3410481214523315, "learning_rate": 4.60356903928305e-05, "loss": 0.0187, "memory/device_mem_reserved(gib)": 53.25, "memory/max_mem_active(gib)": 48.21, "memory/max_mem_allocated(gib)": 48.21, "step": 25000 } ], "logging_steps": 50, "max_steps": 43649, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.430940996497441e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }