{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0653893938403191, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000653893938403191, "grad_norm": 8.0, "learning_rate": 1e-05, "loss": 10.9235, "step": 5 }, { "epoch": 0.001307787876806382, "grad_norm": 7.3125, "learning_rate": 2e-05, "loss": 10.8446, "step": 10 }, { "epoch": 0.001961681815209573, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 10.6184, "step": 15 }, { "epoch": 0.002615575753612764, "grad_norm": 3.453125, "learning_rate": 4e-05, "loss": 10.4354, "step": 20 }, { "epoch": 0.003269469692015955, "grad_norm": 3.359375, "learning_rate": 5e-05, "loss": 10.2832, "step": 25 }, { "epoch": 0.003923363630419146, "grad_norm": 2.90625, "learning_rate": 6e-05, "loss": 10.1894, "step": 30 }, { "epoch": 0.004577257568822337, "grad_norm": 2.984375, "learning_rate": 7.000000000000001e-05, "loss": 10.0521, "step": 35 }, { "epoch": 0.005231151507225528, "grad_norm": 2.5625, "learning_rate": 8e-05, "loss": 9.9261, "step": 40 }, { "epoch": 0.005885045445628719, "grad_norm": 2.59375, "learning_rate": 8.999999999999999e-05, "loss": 9.7805, "step": 45 }, { "epoch": 0.00653893938403191, "grad_norm": 2.515625, "learning_rate": 0.0001, "loss": 9.5316, "step": 50 }, { "epoch": 0.007192833322435101, "grad_norm": 2.109375, "learning_rate": 0.00011, "loss": 9.3912, "step": 55 }, { "epoch": 0.007846727260838291, "grad_norm": 2.015625, "learning_rate": 0.00012, "loss": 9.2058, "step": 60 }, { "epoch": 0.008500621199241483, "grad_norm": 1.828125, "learning_rate": 0.00013000000000000002, "loss": 9.0564, "step": 65 }, { "epoch": 0.009154515137644674, "grad_norm": 1.703125, "learning_rate": 0.00014000000000000001, "loss": 8.8344, "step": 70 }, { "epoch": 0.009808409076047865, "grad_norm": 1.375, "learning_rate": 0.00015, "loss": 8.6926, "step": 75 }, { "epoch": 0.010462303014451056, "grad_norm": 1.3125, "learning_rate": 0.00016, "loss": 8.5817, "step": 80 }, { "epoch": 0.011116196952854247, "grad_norm": 1.171875, "learning_rate": 0.00017, "loss": 8.4945, "step": 85 }, { "epoch": 0.011770090891257438, "grad_norm": 1.0859375, "learning_rate": 0.00017999999999999998, "loss": 8.4975, "step": 90 }, { "epoch": 0.01242398482966063, "grad_norm": 1.2734375, "learning_rate": 0.00019, "loss": 8.4394, "step": 95 }, { "epoch": 0.01307787876806382, "grad_norm": 1.359375, "learning_rate": 0.0002, "loss": 8.4332, "step": 100 }, { "epoch": 0.013731772706467011, "grad_norm": 1.9375, "learning_rate": 0.00021, "loss": 8.4649, "step": 105 }, { "epoch": 0.014385666644870202, "grad_norm": 1.34375, "learning_rate": 0.00022, "loss": 8.408, "step": 110 }, { "epoch": 0.015039560583273394, "grad_norm": 1.4296875, "learning_rate": 0.00023, "loss": 8.3359, "step": 115 }, { "epoch": 0.015693454521676583, "grad_norm": 1.3671875, "learning_rate": 0.00024, "loss": 8.3483, "step": 120 }, { "epoch": 0.016347348460079774, "grad_norm": 1.484375, "learning_rate": 0.00025, "loss": 8.3056, "step": 125 }, { "epoch": 0.017001242398482965, "grad_norm": 1.390625, "learning_rate": 0.00026000000000000003, "loss": 8.3724, "step": 130 }, { "epoch": 0.017655136336886156, "grad_norm": 2.171875, "learning_rate": 0.00027, "loss": 8.3476, "step": 135 }, { "epoch": 0.018309030275289347, "grad_norm": 1.421875, "learning_rate": 0.00028000000000000003, "loss": 8.2929, "step": 140 }, { "epoch": 0.01896292421369254, "grad_norm": 1.6875, "learning_rate": 0.00029, "loss": 8.2493, "step": 145 }, { "epoch": 0.01961681815209573, "grad_norm": 1.3125, "learning_rate": 0.0003, "loss": 8.291, "step": 150 }, { "epoch": 0.02027071209049892, "grad_norm": 1.3046875, "learning_rate": 0.00031, "loss": 8.1787, "step": 155 }, { "epoch": 0.02092460602890211, "grad_norm": 1.4375, "learning_rate": 0.00032, "loss": 8.2745, "step": 160 }, { "epoch": 0.021578499967305303, "grad_norm": 1.484375, "learning_rate": 0.00033, "loss": 8.1676, "step": 165 }, { "epoch": 0.022232393905708494, "grad_norm": 1.453125, "learning_rate": 0.00034, "loss": 8.1576, "step": 170 }, { "epoch": 0.022886287844111685, "grad_norm": 1.515625, "learning_rate": 0.00035, "loss": 8.1338, "step": 175 }, { "epoch": 0.023540181782514876, "grad_norm": 1.34375, "learning_rate": 0.00035999999999999997, "loss": 8.0981, "step": 180 }, { "epoch": 0.024194075720918067, "grad_norm": 1.8515625, "learning_rate": 0.00037, "loss": 8.1553, "step": 185 }, { "epoch": 0.02484796965932126, "grad_norm": 2.0625, "learning_rate": 0.00038, "loss": 8.0505, "step": 190 }, { "epoch": 0.02550186359772445, "grad_norm": 1.4140625, "learning_rate": 0.00039000000000000005, "loss": 8.0276, "step": 195 }, { "epoch": 0.02615575753612764, "grad_norm": 1.46875, "learning_rate": 0.0004, "loss": 8.026, "step": 200 }, { "epoch": 0.02680965147453083, "grad_norm": 1.453125, "learning_rate": 0.00041, "loss": 8.0615, "step": 205 }, { "epoch": 0.027463545412934023, "grad_norm": 1.6171875, "learning_rate": 0.00042, "loss": 8.0192, "step": 210 }, { "epoch": 0.028117439351337214, "grad_norm": 1.5390625, "learning_rate": 0.00043, "loss": 7.9596, "step": 215 }, { "epoch": 0.028771333289740405, "grad_norm": 1.484375, "learning_rate": 0.00044, "loss": 7.9729, "step": 220 }, { "epoch": 0.029425227228143596, "grad_norm": 1.5078125, "learning_rate": 0.00045000000000000004, "loss": 7.9676, "step": 225 }, { "epoch": 0.030079121166546787, "grad_norm": 1.703125, "learning_rate": 0.00046, "loss": 7.9347, "step": 230 }, { "epoch": 0.030733015104949978, "grad_norm": 1.875, "learning_rate": 0.00047, "loss": 7.9303, "step": 235 }, { "epoch": 0.031386909043353166, "grad_norm": 1.671875, "learning_rate": 0.00048, "loss": 7.9393, "step": 240 }, { "epoch": 0.03204080298175636, "grad_norm": 1.59375, "learning_rate": 0.00049, "loss": 7.8751, "step": 245 }, { "epoch": 0.03269469692015955, "grad_norm": 1.5703125, "learning_rate": 0.0005, "loss": 7.9601, "step": 250 }, { "epoch": 0.03334859085856274, "grad_norm": 1.4765625, "learning_rate": 0.00051, "loss": 7.8759, "step": 255 }, { "epoch": 0.03400248479696593, "grad_norm": 1.6796875, "learning_rate": 0.0005200000000000001, "loss": 7.8174, "step": 260 }, { "epoch": 0.03465637873536912, "grad_norm": 1.4609375, "learning_rate": 0.0005300000000000001, "loss": 7.8309, "step": 265 }, { "epoch": 0.03531027267377231, "grad_norm": 1.90625, "learning_rate": 0.00054, "loss": 7.8638, "step": 270 }, { "epoch": 0.035964166612175504, "grad_norm": 1.6875, "learning_rate": 0.00055, "loss": 7.8285, "step": 275 }, { "epoch": 0.036618060550578695, "grad_norm": 1.5703125, "learning_rate": 0.0005600000000000001, "loss": 7.8669, "step": 280 }, { "epoch": 0.037271954488981886, "grad_norm": 1.5, "learning_rate": 0.00057, "loss": 7.8088, "step": 285 }, { "epoch": 0.03792584842738508, "grad_norm": 1.5234375, "learning_rate": 0.00058, "loss": 7.7459, "step": 290 }, { "epoch": 0.03857974236578827, "grad_norm": 1.6015625, "learning_rate": 0.00059, "loss": 7.7809, "step": 295 }, { "epoch": 0.03923363630419146, "grad_norm": 1.5703125, "learning_rate": 0.0006, "loss": 7.7242, "step": 300 }, { "epoch": 0.03988753024259465, "grad_norm": 1.5625, "learning_rate": 0.00061, "loss": 7.7242, "step": 305 }, { "epoch": 0.04054142418099784, "grad_norm": 1.625, "learning_rate": 0.00062, "loss": 7.7411, "step": 310 }, { "epoch": 0.04119531811940103, "grad_norm": 1.4921875, "learning_rate": 0.00063, "loss": 7.6911, "step": 315 }, { "epoch": 0.04184921205780422, "grad_norm": 1.4296875, "learning_rate": 0.00064, "loss": 7.6351, "step": 320 }, { "epoch": 0.042503105996207415, "grad_norm": 1.5078125, "learning_rate": 0.0006500000000000001, "loss": 7.6582, "step": 325 }, { "epoch": 0.043156999934610606, "grad_norm": 1.6640625, "learning_rate": 0.00066, "loss": 7.6329, "step": 330 }, { "epoch": 0.0438108938730138, "grad_norm": 1.6875, "learning_rate": 0.00067, "loss": 7.6285, "step": 335 }, { "epoch": 0.04446478781141699, "grad_norm": 1.5, "learning_rate": 0.00068, "loss": 7.6687, "step": 340 }, { "epoch": 0.04511868174982018, "grad_norm": 1.6484375, "learning_rate": 0.00069, "loss": 7.6195, "step": 345 }, { "epoch": 0.04577257568822337, "grad_norm": 1.5390625, "learning_rate": 0.0007, "loss": 7.5768, "step": 350 }, { "epoch": 0.04642646962662656, "grad_norm": 1.9609375, "learning_rate": 0.00071, "loss": 7.6513, "step": 355 }, { "epoch": 0.04708036356502975, "grad_norm": 1.6953125, "learning_rate": 0.0007199999999999999, "loss": 7.587, "step": 360 }, { "epoch": 0.04773425750343294, "grad_norm": 1.9765625, "learning_rate": 0.00073, "loss": 7.5975, "step": 365 }, { "epoch": 0.048388151441836134, "grad_norm": 1.5546875, "learning_rate": 0.00074, "loss": 7.565, "step": 370 }, { "epoch": 0.049042045380239326, "grad_norm": 1.515625, "learning_rate": 0.00075, "loss": 7.5604, "step": 375 }, { "epoch": 0.04969593931864252, "grad_norm": 1.515625, "learning_rate": 0.00076, "loss": 7.5916, "step": 380 }, { "epoch": 0.05034983325704571, "grad_norm": 1.4765625, "learning_rate": 0.0007700000000000001, "loss": 7.5601, "step": 385 }, { "epoch": 0.0510037271954489, "grad_norm": 1.5546875, "learning_rate": 0.0007800000000000001, "loss": 7.5528, "step": 390 }, { "epoch": 0.05165762113385209, "grad_norm": 1.7109375, "learning_rate": 0.00079, "loss": 7.5534, "step": 395 }, { "epoch": 0.05231151507225528, "grad_norm": 1.4765625, "learning_rate": 0.0008, "loss": 7.5243, "step": 400 }, { "epoch": 0.05296540901065847, "grad_norm": 1.5703125, "learning_rate": 0.0008100000000000001, "loss": 7.5231, "step": 405 }, { "epoch": 0.05361930294906166, "grad_norm": 1.46875, "learning_rate": 0.00082, "loss": 7.5174, "step": 410 }, { "epoch": 0.054273196887464854, "grad_norm": 1.46875, "learning_rate": 0.00083, "loss": 7.4922, "step": 415 }, { "epoch": 0.054927090825868045, "grad_norm": 1.6484375, "learning_rate": 0.00084, "loss": 7.3765, "step": 420 }, { "epoch": 0.055580984764271237, "grad_norm": 1.609375, "learning_rate": 0.00085, "loss": 7.4035, "step": 425 }, { "epoch": 0.05623487870267443, "grad_norm": 1.53125, "learning_rate": 0.00086, "loss": 7.4614, "step": 430 }, { "epoch": 0.05688877264107762, "grad_norm": 1.53125, "learning_rate": 0.00087, "loss": 7.4449, "step": 435 }, { "epoch": 0.05754266657948081, "grad_norm": 1.5859375, "learning_rate": 0.00088, "loss": 7.5239, "step": 440 }, { "epoch": 0.058196560517884, "grad_norm": 1.5, "learning_rate": 0.0008900000000000001, "loss": 7.3661, "step": 445 }, { "epoch": 0.05885045445628719, "grad_norm": 1.53125, "learning_rate": 0.0009000000000000001, "loss": 7.3652, "step": 450 }, { "epoch": 0.05950434839469038, "grad_norm": 1.296875, "learning_rate": 0.00091, "loss": 7.4256, "step": 455 }, { "epoch": 0.060158242333093574, "grad_norm": 1.5546875, "learning_rate": 0.00092, "loss": 7.4048, "step": 460 }, { "epoch": 0.060812136271496765, "grad_norm": 1.5234375, "learning_rate": 0.00093, "loss": 7.4004, "step": 465 }, { "epoch": 0.061466030209899956, "grad_norm": 1.390625, "learning_rate": 0.00094, "loss": 7.3547, "step": 470 }, { "epoch": 0.06211992414830315, "grad_norm": 1.5234375, "learning_rate": 0.00095, "loss": 7.4078, "step": 475 }, { "epoch": 0.06277381808670633, "grad_norm": 1.46875, "learning_rate": 0.00096, "loss": 7.3878, "step": 480 }, { "epoch": 0.06342771202510952, "grad_norm": 1.453125, "learning_rate": 0.0009699999999999999, "loss": 7.3434, "step": 485 }, { "epoch": 0.06408160596351271, "grad_norm": 1.625, "learning_rate": 0.00098, "loss": 7.3696, "step": 490 }, { "epoch": 0.0647354999019159, "grad_norm": 1.4453125, "learning_rate": 0.00099, "loss": 7.3436, "step": 495 }, { "epoch": 0.0653893938403191, "grad_norm": 1.4921875, "learning_rate": 0.001, "loss": 7.2601, "step": 500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 670418530467840.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }