{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0653893938403191, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000653893938403191, "grad_norm": 8.4375, "learning_rate": 1e-05, "loss": 10.9218, "step": 5 }, { "epoch": 0.001307787876806382, "grad_norm": 8.5, "learning_rate": 2e-05, "loss": 10.86, "step": 10 }, { "epoch": 0.001961681815209573, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 10.6568, "step": 15 }, { "epoch": 0.002615575753612764, "grad_norm": 3.4375, "learning_rate": 4e-05, "loss": 10.4622, "step": 20 }, { "epoch": 0.003269469692015955, "grad_norm": 3.0625, "learning_rate": 5e-05, "loss": 10.2882, "step": 25 }, { "epoch": 0.003923363630419146, "grad_norm": 3.09375, "learning_rate": 6e-05, "loss": 10.2014, "step": 30 }, { "epoch": 0.004577257568822337, "grad_norm": 2.703125, "learning_rate": 7.000000000000001e-05, "loss": 10.0942, "step": 35 }, { "epoch": 0.005231151507225528, "grad_norm": 2.984375, "learning_rate": 8e-05, "loss": 9.9471, "step": 40 }, { "epoch": 0.005885045445628719, "grad_norm": 2.703125, "learning_rate": 8.999999999999999e-05, "loss": 9.7595, "step": 45 }, { "epoch": 0.00653893938403191, "grad_norm": 2.46875, "learning_rate": 0.0001, "loss": 9.6038, "step": 50 }, { "epoch": 0.007192833322435101, "grad_norm": 2.53125, "learning_rate": 0.00011, "loss": 9.3898, "step": 55 }, { "epoch": 0.007846727260838291, "grad_norm": 2.125, "learning_rate": 0.00012, "loss": 9.2079, "step": 60 }, { "epoch": 0.008500621199241483, "grad_norm": 1.8046875, "learning_rate": 0.00013000000000000002, "loss": 9.0458, "step": 65 }, { "epoch": 0.009154515137644674, "grad_norm": 1.734375, "learning_rate": 0.00014000000000000001, "loss": 8.8588, "step": 70 }, { "epoch": 0.009808409076047865, "grad_norm": 1.34375, "learning_rate": 0.00015, "loss": 8.7008, "step": 75 }, { "epoch": 0.010462303014451056, "grad_norm": 1.109375, "learning_rate": 0.00016, "loss": 8.5939, "step": 80 }, { "epoch": 0.011116196952854247, "grad_norm": 1.1640625, "learning_rate": 0.00017, "loss": 8.5616, "step": 85 }, { "epoch": 0.011770090891257438, "grad_norm": 1.3984375, "learning_rate": 0.00017999999999999998, "loss": 8.4525, "step": 90 }, { "epoch": 0.01242398482966063, "grad_norm": 1.109375, "learning_rate": 0.00019, "loss": 8.4594, "step": 95 }, { "epoch": 0.01307787876806382, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 8.4348, "step": 100 }, { "epoch": 0.013731772706467011, "grad_norm": 1.4375, "learning_rate": 0.00021, "loss": 8.4187, "step": 105 }, { "epoch": 0.014385666644870202, "grad_norm": 1.78125, "learning_rate": 0.00022, "loss": 8.3887, "step": 110 }, { "epoch": 0.015039560583273394, "grad_norm": 1.375, "learning_rate": 0.00023, "loss": 8.3935, "step": 115 }, { "epoch": 0.015693454521676583, "grad_norm": 1.4453125, "learning_rate": 0.00024, "loss": 8.3468, "step": 120 }, { "epoch": 0.016347348460079774, "grad_norm": 1.5859375, "learning_rate": 0.00025, "loss": 8.3648, "step": 125 }, { "epoch": 0.017001242398482965, "grad_norm": 1.8046875, "learning_rate": 0.00026000000000000003, "loss": 8.3006, "step": 130 }, { "epoch": 0.017655136336886156, "grad_norm": 1.421875, "learning_rate": 0.00027, "loss": 8.2614, "step": 135 }, { "epoch": 0.018309030275289347, "grad_norm": 1.3828125, "learning_rate": 0.00028000000000000003, "loss": 8.25, "step": 140 }, { "epoch": 0.01896292421369254, "grad_norm": 1.90625, "learning_rate": 0.00029, "loss": 8.2765, "step": 145 }, { "epoch": 0.01961681815209573, "grad_norm": 1.4296875, "learning_rate": 0.0003, "loss": 8.2609, "step": 150 }, { "epoch": 0.02027071209049892, "grad_norm": 1.703125, "learning_rate": 0.00031, "loss": 8.1702, "step": 155 }, { "epoch": 0.02092460602890211, "grad_norm": 1.53125, "learning_rate": 0.00032, "loss": 8.1875, "step": 160 }, { "epoch": 0.021578499967305303, "grad_norm": 1.5, "learning_rate": 0.00033, "loss": 8.1725, "step": 165 }, { "epoch": 0.022232393905708494, "grad_norm": 1.9140625, "learning_rate": 0.00034, "loss": 8.1809, "step": 170 }, { "epoch": 0.022886287844111685, "grad_norm": 1.5703125, "learning_rate": 0.00035, "loss": 8.1357, "step": 175 }, { "epoch": 0.023540181782514876, "grad_norm": 1.453125, "learning_rate": 0.00035999999999999997, "loss": 8.1805, "step": 180 }, { "epoch": 0.024194075720918067, "grad_norm": 1.84375, "learning_rate": 0.00037, "loss": 8.0948, "step": 185 }, { "epoch": 0.02484796965932126, "grad_norm": 1.65625, "learning_rate": 0.00038, "loss": 8.0769, "step": 190 }, { "epoch": 0.02550186359772445, "grad_norm": 1.4375, "learning_rate": 0.00039000000000000005, "loss": 8.0623, "step": 195 }, { "epoch": 0.02615575753612764, "grad_norm": 1.375, "learning_rate": 0.0004, "loss": 8.0233, "step": 200 }, { "epoch": 0.02680965147453083, "grad_norm": 1.65625, "learning_rate": 0.00041, "loss": 8.0448, "step": 205 }, { "epoch": 0.027463545412934023, "grad_norm": 1.421875, "learning_rate": 0.00042, "loss": 7.9827, "step": 210 }, { "epoch": 0.028117439351337214, "grad_norm": 1.8125, "learning_rate": 0.00043, "loss": 7.9879, "step": 215 }, { "epoch": 0.028771333289740405, "grad_norm": 1.484375, "learning_rate": 0.00044, "loss": 7.9859, "step": 220 }, { "epoch": 0.029425227228143596, "grad_norm": 1.40625, "learning_rate": 0.00045000000000000004, "loss": 7.9821, "step": 225 }, { "epoch": 0.030079121166546787, "grad_norm": 1.421875, "learning_rate": 0.00046, "loss": 7.9357, "step": 230 }, { "epoch": 0.030733015104949978, "grad_norm": 1.375, "learning_rate": 0.00047, "loss": 7.9613, "step": 235 }, { "epoch": 0.031386909043353166, "grad_norm": 1.96875, "learning_rate": 0.00048, "loss": 7.8928, "step": 240 }, { "epoch": 0.03204080298175636, "grad_norm": 1.5859375, "learning_rate": 0.00049, "loss": 7.9647, "step": 245 }, { "epoch": 0.03269469692015955, "grad_norm": 1.6875, "learning_rate": 0.0005, "loss": 7.8866, "step": 250 }, { "epoch": 0.03334859085856274, "grad_norm": 1.5703125, "learning_rate": 0.00051, "loss": 7.8719, "step": 255 }, { "epoch": 0.03400248479696593, "grad_norm": 1.4609375, "learning_rate": 0.0005200000000000001, "loss": 7.8036, "step": 260 }, { "epoch": 0.03465637873536912, "grad_norm": 1.53125, "learning_rate": 0.0005300000000000001, "loss": 7.8523, "step": 265 }, { "epoch": 0.03531027267377231, "grad_norm": 1.46875, "learning_rate": 0.00054, "loss": 7.8359, "step": 270 }, { "epoch": 0.035964166612175504, "grad_norm": 1.65625, "learning_rate": 0.00055, "loss": 7.8399, "step": 275 }, { "epoch": 0.036618060550578695, "grad_norm": 1.65625, "learning_rate": 0.0005600000000000001, "loss": 7.8428, "step": 280 }, { "epoch": 0.037271954488981886, "grad_norm": 1.7421875, "learning_rate": 0.00057, "loss": 7.7551, "step": 285 }, { "epoch": 0.03792584842738508, "grad_norm": 1.4765625, "learning_rate": 0.00058, "loss": 7.7357, "step": 290 }, { "epoch": 0.03857974236578827, "grad_norm": 1.375, "learning_rate": 0.00059, "loss": 7.7327, "step": 295 }, { "epoch": 0.03923363630419146, "grad_norm": 1.5859375, "learning_rate": 0.0006, "loss": 7.6911, "step": 300 }, { "epoch": 0.03988753024259465, "grad_norm": 1.3359375, "learning_rate": 0.00061, "loss": 7.6854, "step": 305 }, { "epoch": 0.04054142418099784, "grad_norm": 1.4921875, "learning_rate": 0.00062, "loss": 7.7088, "step": 310 }, { "epoch": 0.04119531811940103, "grad_norm": 1.59375, "learning_rate": 0.00063, "loss": 7.6666, "step": 315 }, { "epoch": 0.04184921205780422, "grad_norm": 1.4921875, "learning_rate": 0.00064, "loss": 7.6758, "step": 320 }, { "epoch": 0.042503105996207415, "grad_norm": 1.46875, "learning_rate": 0.0006500000000000001, "loss": 7.6554, "step": 325 }, { "epoch": 0.043156999934610606, "grad_norm": 1.6328125, "learning_rate": 0.00066, "loss": 7.6471, "step": 330 }, { "epoch": 0.0438108938730138, "grad_norm": 1.546875, "learning_rate": 0.00067, "loss": 7.612, "step": 335 }, { "epoch": 0.04446478781141699, "grad_norm": 1.546875, "learning_rate": 0.00068, "loss": 7.6353, "step": 340 }, { "epoch": 0.04511868174982018, "grad_norm": 1.5859375, "learning_rate": 0.00069, "loss": 7.6068, "step": 345 }, { "epoch": 0.04577257568822337, "grad_norm": 1.640625, "learning_rate": 0.0007, "loss": 7.6507, "step": 350 }, { "epoch": 0.04642646962662656, "grad_norm": 1.6328125, "learning_rate": 0.00071, "loss": 7.5806, "step": 355 }, { "epoch": 0.04708036356502975, "grad_norm": 1.6328125, "learning_rate": 0.0007199999999999999, "loss": 7.5964, "step": 360 }, { "epoch": 0.04773425750343294, "grad_norm": 1.484375, "learning_rate": 0.00073, "loss": 7.5817, "step": 365 }, { "epoch": 0.048388151441836134, "grad_norm": 1.4375, "learning_rate": 0.00074, "loss": 7.5673, "step": 370 }, { "epoch": 0.049042045380239326, "grad_norm": 1.6015625, "learning_rate": 0.00075, "loss": 7.5662, "step": 375 }, { "epoch": 0.04969593931864252, "grad_norm": 1.5234375, "learning_rate": 0.00076, "loss": 7.5739, "step": 380 }, { "epoch": 0.05034983325704571, "grad_norm": 1.6328125, "learning_rate": 0.0007700000000000001, "loss": 7.5809, "step": 385 }, { "epoch": 0.0510037271954489, "grad_norm": 1.6328125, "learning_rate": 0.0007800000000000001, "loss": 7.584, "step": 390 }, { "epoch": 0.05165762113385209, "grad_norm": 1.8671875, "learning_rate": 0.00079, "loss": 7.4952, "step": 395 }, { "epoch": 0.05231151507225528, "grad_norm": 1.6328125, "learning_rate": 0.0008, "loss": 7.5619, "step": 400 }, { "epoch": 0.05296540901065847, "grad_norm": 1.4609375, "learning_rate": 0.0008100000000000001, "loss": 7.5143, "step": 405 }, { "epoch": 0.05361930294906166, "grad_norm": 1.59375, "learning_rate": 0.00082, "loss": 7.429, "step": 410 }, { "epoch": 0.054273196887464854, "grad_norm": 1.5625, "learning_rate": 0.00083, "loss": 7.5304, "step": 415 }, { "epoch": 0.054927090825868045, "grad_norm": 1.515625, "learning_rate": 0.00084, "loss": 7.4544, "step": 420 }, { "epoch": 0.055580984764271237, "grad_norm": 1.5546875, "learning_rate": 0.00085, "loss": 7.4307, "step": 425 }, { "epoch": 0.05623487870267443, "grad_norm": 1.5078125, "learning_rate": 0.00086, "loss": 7.4134, "step": 430 }, { "epoch": 0.05688877264107762, "grad_norm": 1.4765625, "learning_rate": 0.00087, "loss": 7.5005, "step": 435 }, { "epoch": 0.05754266657948081, "grad_norm": 1.421875, "learning_rate": 0.00088, "loss": 7.4401, "step": 440 }, { "epoch": 0.058196560517884, "grad_norm": 1.671875, "learning_rate": 0.0008900000000000001, "loss": 7.4792, "step": 445 }, { "epoch": 0.05885045445628719, "grad_norm": 1.5, "learning_rate": 0.0009000000000000001, "loss": 7.405, "step": 450 }, { "epoch": 0.05950434839469038, "grad_norm": 1.5625, "learning_rate": 0.00091, "loss": 7.4043, "step": 455 }, { "epoch": 0.060158242333093574, "grad_norm": 1.5, "learning_rate": 0.00092, "loss": 7.4356, "step": 460 }, { "epoch": 0.060812136271496765, "grad_norm": 1.4453125, "learning_rate": 0.00093, "loss": 7.2855, "step": 465 }, { "epoch": 0.061466030209899956, "grad_norm": 1.6015625, "learning_rate": 0.00094, "loss": 7.3614, "step": 470 }, { "epoch": 0.06211992414830315, "grad_norm": 1.6171875, "learning_rate": 0.00095, "loss": 7.4136, "step": 475 }, { "epoch": 0.06277381808670633, "grad_norm": 1.5546875, "learning_rate": 0.00096, "loss": 7.3571, "step": 480 }, { "epoch": 0.06342771202510952, "grad_norm": 1.6015625, "learning_rate": 0.0009699999999999999, "loss": 7.3855, "step": 485 }, { "epoch": 0.06408160596351271, "grad_norm": 1.6875, "learning_rate": 0.00098, "loss": 7.5011, "step": 490 }, { "epoch": 0.0647354999019159, "grad_norm": 1.578125, "learning_rate": 0.00099, "loss": 7.3837, "step": 495 }, { "epoch": 0.0653893938403191, "grad_norm": 1.5859375, "learning_rate": 0.001, "loss": 7.2856, "step": 500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 675529741762560.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }