{
  "best_global_step": 12540,
  "best_metric": 0.15973736345767975,
  "best_model_checkpoint": "models/qwen3-0.6b-distilled/checkpoint-12540",
  "epoch": 1.9959014762643745,
  "eval_steps": 209,
  "global_step": 12540,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00015916597031554654,
      "grad_norm": 19.75,
      "learning_rate": 0.0,
      "loss": 0.714,
      "step": 1
    },
    {
      "epoch": 0.0015916597031554653,
      "grad_norm": 19.125,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.7182,
      "step": 10
    },
    {
      "epoch": 0.0031833194063109306,
      "grad_norm": 13.5625,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.6683,
      "step": 20
    },
    {
      "epoch": 0.004774979109466396,
      "grad_norm": 16.5,
      "learning_rate": 5.8e-06,
      "loss": 0.5995,
      "step": 30
    },
    {
      "epoch": 0.006366638812621861,
      "grad_norm": 6.8125,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.3777,
      "step": 40
    },
    {
      "epoch": 0.007958298515777326,
      "grad_norm": 6.875,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.2771,
      "step": 50
    },
    {
      "epoch": 0.009549958218932793,
      "grad_norm": 9.125,
      "learning_rate": 1.18e-05,
      "loss": 0.3166,
      "step": 60
    },
    {
      "epoch": 0.011141617922088258,
      "grad_norm": 6.59375,
      "learning_rate": 1.38e-05,
      "loss": 0.2878,
      "step": 70
    },
    {
      "epoch": 0.012733277625243723,
      "grad_norm": 7.875,
      "learning_rate": 1.58e-05,
      "loss": 0.3306,
      "step": 80
    },
    {
      "epoch": 0.014324937328399187,
      "grad_norm": 11.1875,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 0.3323,
      "step": 90
    },
    {
      "epoch": 0.015916597031554652,
      "grad_norm": 6.75,
      "learning_rate": 1.98e-05,
      "loss": 0.2828,
      "step": 100
    },
    {
      "epoch": 0.01750825673471012,
      "grad_norm": 6.28125,
      "learning_rate": 1.998556072517247e-05,
      "loss": 0.3218,
      "step": 110
    },
    {
      "epoch": 0.019099916437865586,
      "grad_norm": 5.28125,
      "learning_rate": 1.9969517086475215e-05,
      "loss": 0.2328,
      "step": 120
    },
    {
      "epoch": 0.02069157614102105,
      "grad_norm": 7.65625,
      "learning_rate": 1.9953473447777958e-05,
      "loss": 0.2444,
      "step": 130
    },
    {
      "epoch": 0.022283235844176515,
      "grad_norm": 4.3125,
      "learning_rate": 1.99374298090807e-05,
      "loss": 0.2528,
      "step": 140
    },
    {
      "epoch": 0.023874895547331982,
      "grad_norm": 2.3125,
      "learning_rate": 1.9921386170383446e-05,
      "loss": 0.2828,
      "step": 150
    },
    {
      "epoch": 0.025466555250487445,
      "grad_norm": 8.0625,
      "learning_rate": 1.9905342531686188e-05,
      "loss": 0.2027,
      "step": 160
    },
    {
      "epoch": 0.027058214953642912,
      "grad_norm": 5.28125,
      "learning_rate": 1.988929889298893e-05,
      "loss": 0.2844,
      "step": 170
    },
    {
      "epoch": 0.028649874656798375,
      "grad_norm": 6.71875,
      "learning_rate": 1.9873255254291673e-05,
      "loss": 0.3,
      "step": 180
    },
    {
      "epoch": 0.03024153435995384,
      "grad_norm": 5.84375,
      "learning_rate": 1.985721161559442e-05,
      "loss": 0.2529,
      "step": 190
    },
    {
      "epoch": 0.031833194063109305,
      "grad_norm": 6.46875,
      "learning_rate": 1.984116797689716e-05,
      "loss": 0.2419,
      "step": 200
    },
    {
      "epoch": 0.033265687795949225,
      "eval_loss": 0.24636909365653992,
      "eval_runtime": 20.1286,
      "eval_samples_per_second": 12.619,
      "eval_steps_per_second": 12.619,
      "step": 209
    },
    {
      "epoch": 0.03342485376626477,
      "grad_norm": 5.90625,
      "learning_rate": 1.9825124338199907e-05,
      "loss": 0.2255,
      "step": 210
    },
    {
      "epoch": 0.03501651346942024,
      "grad_norm": 7.78125,
      "learning_rate": 1.980908069950265e-05,
      "loss": 0.3541,
      "step": 220
    },
    {
      "epoch": 0.036608173172575705,
      "grad_norm": 8.375,
      "learning_rate": 1.979303706080539e-05,
      "loss": 0.2831,
      "step": 230
    },
    {
      "epoch": 0.03819983287573117,
      "grad_norm": 4.5625,
      "learning_rate": 1.9776993422108134e-05,
      "loss": 0.2478,
      "step": 240
    },
    {
      "epoch": 0.03979149257888663,
      "grad_norm": 4.875,
      "learning_rate": 1.976094978341088e-05,
      "loss": 0.2631,
      "step": 250
    },
    {
      "epoch": 0.0413831522820421,
      "grad_norm": 7.46875,
      "learning_rate": 1.9744906144713622e-05,
      "loss": 0.2542,
      "step": 260
    },
    {
      "epoch": 0.042974811985197564,
      "grad_norm": 4.4375,
      "learning_rate": 1.9728862506016367e-05,
      "loss": 0.2217,
      "step": 270
    },
    {
      "epoch": 0.04456647168835303,
      "grad_norm": 3.8125,
      "learning_rate": 1.971281886731911e-05,
      "loss": 0.2627,
      "step": 280
    },
    {
      "epoch": 0.0461581313915085,
      "grad_norm": 8.1875,
      "learning_rate": 1.9696775228621856e-05,
      "loss": 0.2786,
      "step": 290
    },
    {
      "epoch": 0.047749791094663964,
      "grad_norm": 6.6875,
      "learning_rate": 1.9680731589924594e-05,
      "loss": 0.3167,
      "step": 300
    },
    {
      "epoch": 0.049341450797819424,
      "grad_norm": 5.375,
      "learning_rate": 1.966468795122734e-05,
      "loss": 0.2918,
      "step": 310
    },
    {
      "epoch": 0.05093311050097489,
      "grad_norm": 4.40625,
      "learning_rate": 1.9648644312530083e-05,
      "loss": 0.2285,
      "step": 320
    },
    {
      "epoch": 0.05252477020413036,
      "grad_norm": 3.84375,
      "learning_rate": 1.9632600673832828e-05,
      "loss": 0.2076,
      "step": 330
    },
    {
      "epoch": 0.054116429907285823,
      "grad_norm": 5.1875,
      "learning_rate": 1.961655703513557e-05,
      "loss": 0.2452,
      "step": 340
    },
    {
      "epoch": 0.05570808961044129,
      "grad_norm": 4.9375,
      "learning_rate": 1.9600513396438313e-05,
      "loss": 0.235,
      "step": 350
    },
    {
      "epoch": 0.05729974931359675,
      "grad_norm": 3.5,
      "learning_rate": 1.958446975774106e-05,
      "loss": 0.2586,
      "step": 360
    },
    {
      "epoch": 0.058891409016752216,
      "grad_norm": 3.359375,
      "learning_rate": 1.95684261190438e-05,
      "loss": 0.2479,
      "step": 370
    },
    {
      "epoch": 0.06048306871990768,
      "grad_norm": 2.828125,
      "learning_rate": 1.9552382480346543e-05,
      "loss": 0.2164,
      "step": 380
    },
    {
      "epoch": 0.06207472842306315,
      "grad_norm": 4.375,
      "learning_rate": 1.9536338841649286e-05,
      "loss": 0.207,
      "step": 390
    },
    {
      "epoch": 0.06366638812621861,
      "grad_norm": 4.0,
      "learning_rate": 1.952029520295203e-05,
      "loss": 0.3306,
      "step": 400
    },
    {
      "epoch": 0.06525804782937408,
      "grad_norm": 16.0,
      "learning_rate": 1.9504251564254774e-05,
      "loss": 0.269,
      "step": 410
    },
    {
      "epoch": 0.06653137559189845,
      "eval_loss": 0.227196604013443,
      "eval_runtime": 17.2442,
      "eval_samples_per_second": 14.73,
      "eval_steps_per_second": 14.73,
      "step": 418
    },
    {
      "epoch": 0.06684970753252954,
      "grad_norm": 4.34375,
      "learning_rate": 1.948820792555752e-05,
      "loss": 0.2425,
      "step": 420
    },
    {
      "epoch": 0.06844136723568502,
      "grad_norm": 6.8125,
      "learning_rate": 1.9472164286860262e-05,
      "loss": 0.2309,
      "step": 430
    },
    {
      "epoch": 0.07003302693884048,
      "grad_norm": 8.5625,
      "learning_rate": 1.9456120648163008e-05,
      "loss": 0.2422,
      "step": 440
    },
    {
      "epoch": 0.07162468664199594,
      "grad_norm": 4.9375,
      "learning_rate": 1.9440077009465747e-05,
      "loss": 0.2449,
      "step": 450
    },
    {
      "epoch": 0.07321634634515141,
      "grad_norm": 3.796875,
      "learning_rate": 1.9424033370768492e-05,
      "loss": 0.2563,
      "step": 460
    },
    {
      "epoch": 0.07480800604830687,
      "grad_norm": 4.6875,
      "learning_rate": 1.9407989732071235e-05,
      "loss": 0.281,
      "step": 470
    },
    {
      "epoch": 0.07639966575146234,
      "grad_norm": 3.734375,
      "learning_rate": 1.939194609337398e-05,
      "loss": 0.2911,
      "step": 480
    },
    {
      "epoch": 0.0779913254546178,
      "grad_norm": 5.375,
      "learning_rate": 1.9375902454676723e-05,
      "loss": 0.2553,
      "step": 490
    },
    {
      "epoch": 0.07958298515777326,
      "grad_norm": 7.34375,
      "learning_rate": 1.9359858815979465e-05,
      "loss": 0.216,
      "step": 500
    },
    {
      "epoch": 0.08117464486092874,
      "grad_norm": 7.46875,
      "learning_rate": 1.934381517728221e-05,
      "loss": 0.2502,
      "step": 510
    },
    {
      "epoch": 0.0827663045640842,
      "grad_norm": 5.4375,
      "learning_rate": 1.9327771538584953e-05,
      "loss": 0.2115,
      "step": 520
    },
    {
      "epoch": 0.08435796426723967,
      "grad_norm": 4.28125,
      "learning_rate": 1.9311727899887695e-05,
      "loss": 0.2595,
      "step": 530
    },
    {
      "epoch": 0.08594962397039513,
      "grad_norm": 2.90625,
      "learning_rate": 1.9295684261190438e-05,
      "loss": 0.3313,
      "step": 540
    },
    {
      "epoch": 0.0875412836735506,
      "grad_norm": 4.78125,
      "learning_rate": 1.9279640622493183e-05,
      "loss": 0.2695,
      "step": 550
    },
    {
      "epoch": 0.08913294337670606,
      "grad_norm": 3.765625,
      "learning_rate": 1.9263596983795926e-05,
      "loss": 0.2594,
      "step": 560
    },
    {
      "epoch": 0.09072460307986152,
      "grad_norm": 4.21875,
      "learning_rate": 1.924755334509867e-05,
      "loss": 0.2587,
      "step": 570
    },
    {
      "epoch": 0.092316262783017,
      "grad_norm": 4.125,
      "learning_rate": 1.9231509706401414e-05,
      "loss": 0.2278,
      "step": 580
    },
    {
      "epoch": 0.09390792248617245,
      "grad_norm": 4.75,
      "learning_rate": 1.9215466067704156e-05,
      "loss": 0.1742,
      "step": 590
    },
    {
      "epoch": 0.09549958218932793,
      "grad_norm": 3.390625,
      "learning_rate": 1.91994224290069e-05,
      "loss": 0.2182,
      "step": 600
    },
    {
      "epoch": 0.09709124189248339,
      "grad_norm": 5.9375,
      "learning_rate": 1.9183378790309644e-05,
      "loss": 0.219,
      "step": 610
    },
    {
      "epoch": 0.09868290159563885,
      "grad_norm": 2.125,
      "learning_rate": 1.9167335151612387e-05,
      "loss": 0.2106,
      "step": 620
    },
    {
      "epoch": 0.09979706338784768,
      "eval_loss": 0.22220537066459656,
      "eval_runtime": 19.8681,
      "eval_samples_per_second": 12.784,
      "eval_steps_per_second": 12.784,
      "step": 627
    },
    {
      "epoch": 0.10027456129879432,
      "grad_norm": 4.0,
      "learning_rate": 1.9151291512915132e-05,
      "loss": 0.2479,
      "step": 630
    },
    {
      "epoch": 0.10186622100194978,
      "grad_norm": 13.25,
      "learning_rate": 1.9135247874217875e-05,
      "loss": 0.2224,
      "step": 640
    },
    {
      "epoch": 0.10345788070510525,
      "grad_norm": 5.15625,
      "learning_rate": 1.9119204235520617e-05,
      "loss": 0.2294,
      "step": 650
    },
    {
      "epoch": 0.10504954040826071,
      "grad_norm": 6.625,
      "learning_rate": 1.910316059682336e-05,
      "loss": 0.2919,
      "step": 660
    },
    {
      "epoch": 0.10664120011141617,
      "grad_norm": 2.90625,
      "learning_rate": 1.9087116958126105e-05,
      "loss": 0.2166,
      "step": 670
    },
    {
      "epoch": 0.10823285981457165,
      "grad_norm": 5.46875,
      "learning_rate": 1.9071073319428847e-05,
      "loss": 0.2304,
      "step": 680
    },
    {
      "epoch": 0.1098245195177271,
      "grad_norm": 4.8125,
      "learning_rate": 1.9055029680731593e-05,
      "loss": 0.1778,
      "step": 690
    },
    {
      "epoch": 0.11141617922088258,
      "grad_norm": 4.6875,
      "learning_rate": 1.9038986042034335e-05,
      "loss": 0.2802,
      "step": 700
    },
    {
      "epoch": 0.11300783892403804,
      "grad_norm": 6.84375,
      "learning_rate": 1.9022942403337078e-05,
      "loss": 0.2136,
      "step": 710
    },
    {
      "epoch": 0.1145994986271935,
      "grad_norm": 7.5,
      "learning_rate": 1.9006898764639824e-05,
      "loss": 0.2123,
      "step": 720
    },
    {
      "epoch": 0.11619115833034897,
      "grad_norm": 4.78125,
      "learning_rate": 1.8990855125942566e-05,
      "loss": 0.2375,
      "step": 730
    },
    {
      "epoch": 0.11778281803350443,
      "grad_norm": 5.78125,
      "learning_rate": 1.8974811487245308e-05,
      "loss": 0.2449,
      "step": 740
    },
    {
      "epoch": 0.1193744777366599,
      "grad_norm": 5.46875,
      "learning_rate": 1.895876784854805e-05,
      "loss": 0.224,
      "step": 750
    },
    {
      "epoch": 0.12096613743981537,
      "grad_norm": 6.0,
      "learning_rate": 1.8942724209850796e-05,
      "loss": 0.1943,
      "step": 760
    },
    {
      "epoch": 0.12255779714297084,
      "grad_norm": 6.78125,
      "learning_rate": 1.892668057115354e-05,
      "loss": 0.2475,
      "step": 770
    },
    {
      "epoch": 0.1241494568461263,
      "grad_norm": 7.6875,
      "learning_rate": 1.8910636932456284e-05,
      "loss": 0.2325,
      "step": 780
    },
    {
      "epoch": 0.12574111654928177,
      "grad_norm": 3.796875,
      "learning_rate": 1.8894593293759027e-05,
      "loss": 0.239,
      "step": 790
    },
    {
      "epoch": 0.12733277625243722,
      "grad_norm": 4.09375,
      "learning_rate": 1.887854965506177e-05,
      "loss": 0.2311,
      "step": 800
    },
    {
      "epoch": 0.1289244359555927,
      "grad_norm": 3.4375,
      "learning_rate": 1.886250601636451e-05,
      "loss": 0.2037,
      "step": 810
    },
    {
      "epoch": 0.13051609565874817,
      "grad_norm": 4.125,
      "learning_rate": 1.8846462377667257e-05,
      "loss": 0.1883,
      "step": 820
    },
    {
      "epoch": 0.1321077553619036,
      "grad_norm": 5.28125,
      "learning_rate": 1.883041873897e-05,
      "loss": 0.2256,
      "step": 830
    },
    {
      "epoch": 0.1330627511837969,
      "eval_loss": 0.2150929719209671,
      "eval_runtime": 17.2112,
      "eval_samples_per_second": 14.758,
      "eval_steps_per_second": 14.758,
      "step": 836
    },
    {
      "epoch": 0.13369941506505909,
      "grad_norm": 7.96875,
      "learning_rate": 1.8814375100272745e-05,
      "loss": 0.324,
      "step": 840
    },
    {
      "epoch": 0.13529107476821456,
      "grad_norm": 2.828125,
      "learning_rate": 1.8798331461575487e-05,
      "loss": 0.2214,
      "step": 850
    },
    {
      "epoch": 0.13688273447137003,
      "grad_norm": 1.3828125,
      "learning_rate": 1.878228782287823e-05,
      "loss": 0.2662,
      "step": 860
    },
    {
      "epoch": 0.13847439417452548,
      "grad_norm": 7.53125,
      "learning_rate": 1.8766244184180972e-05,
      "loss": 0.2221,
      "step": 870
    },
    {
      "epoch": 0.14006605387768095,
      "grad_norm": 5.75,
      "learning_rate": 1.8750200545483718e-05,
      "loss": 0.1886,
      "step": 880
    },
    {
      "epoch": 0.14165771358083643,
      "grad_norm": 6.8125,
      "learning_rate": 1.873415690678646e-05,
      "loss": 0.2135,
      "step": 890
    },
    {
      "epoch": 0.14324937328399187,
      "grad_norm": 3.8125,
      "learning_rate": 1.8718113268089203e-05,
      "loss": 0.2703,
      "step": 900
    },
    {
      "epoch": 0.14484103298714734,
      "grad_norm": 5.84375,
      "learning_rate": 1.8702069629391948e-05,
      "loss": 0.2961,
      "step": 910
    },
    {
      "epoch": 0.14643269269030282,
      "grad_norm": 11.0,
      "learning_rate": 1.868602599069469e-05,
      "loss": 0.273,
      "step": 920
    },
    {
      "epoch": 0.1480243523934583,
      "grad_norm": 2.890625,
      "learning_rate": 1.8669982351997436e-05,
      "loss": 0.2435,
      "step": 930
    },
    {
      "epoch": 0.14961601209661374,
      "grad_norm": 3.34375,
      "learning_rate": 1.865393871330018e-05,
      "loss": 0.2907,
      "step": 940
    },
    {
      "epoch": 0.1512076717997692,
      "grad_norm": 5.90625,
      "learning_rate": 1.863789507460292e-05,
      "loss": 0.2482,
      "step": 950
    },
    {
      "epoch": 0.15279933150292468,
      "grad_norm": 4.0,
      "learning_rate": 1.8621851435905663e-05,
      "loss": 0.1827,
      "step": 960
    },
    {
      "epoch": 0.15439099120608013,
      "grad_norm": 6.1875,
      "learning_rate": 1.860580779720841e-05,
      "loss": 0.2043,
      "step": 970
    },
    {
      "epoch": 0.1559826509092356,
      "grad_norm": 3.859375,
      "learning_rate": 1.858976415851115e-05,
      "loss": 0.1861,
      "step": 980
    },
    {
      "epoch": 0.15757431061239108,
      "grad_norm": 3.984375,
      "learning_rate": 1.8573720519813897e-05,
      "loss": 0.2123,
      "step": 990
    },
    {
      "epoch": 0.15916597031554652,
      "grad_norm": 4.21875,
      "learning_rate": 1.855767688111664e-05,
      "loss": 0.1825,
      "step": 1000
    },
    {
      "epoch": 0.160757630018702,
      "grad_norm": 7.03125,
      "learning_rate": 1.8541633242419382e-05,
      "loss": 0.1794,
      "step": 1010
    },
    {
      "epoch": 0.16234928972185747,
      "grad_norm": 3.625,
      "learning_rate": 1.8525589603722124e-05,
      "loss": 0.2458,
      "step": 1020
    },
    {
      "epoch": 0.16394094942501294,
      "grad_norm": 3.671875,
      "learning_rate": 1.850954596502487e-05,
      "loss": 0.2865,
      "step": 1030
    },
    {
      "epoch": 0.1655326091281684,
      "grad_norm": 3.265625,
      "learning_rate": 1.8493502326327612e-05,
      "loss": 0.2387,
      "step": 1040
    },
    {
      "epoch": 0.16632843897974614,
      "eval_loss": 0.20574907958507538,
      "eval_runtime": 17.2983,
      "eval_samples_per_second": 14.684,
      "eval_steps_per_second": 14.684,
      "step": 1045
    },
    {
      "epoch": 0.16712426883132386,
      "grad_norm": 2.671875,
      "learning_rate": 1.8477458687630358e-05,
      "loss": 0.199,
      "step": 1050
    },
    {
      "epoch": 0.16871592853447934,
      "grad_norm": 7.0625,
      "learning_rate": 1.84614150489331e-05,
      "loss": 0.2283,
      "step": 1060
    },
    {
      "epoch": 0.17030758823763478,
      "grad_norm": 5.75,
      "learning_rate": 1.8445371410235843e-05,
      "loss": 0.207,
      "step": 1070
    },
    {
      "epoch": 0.17189924794079026,
      "grad_norm": 3.890625,
      "learning_rate": 1.8429327771538585e-05,
      "loss": 0.2747,
      "step": 1080
    },
    {
      "epoch": 0.17349090764394573,
      "grad_norm": 5.53125,
      "learning_rate": 1.841328413284133e-05,
      "loss": 0.2368,
      "step": 1090
    },
    {
      "epoch": 0.1750825673471012,
      "grad_norm": 3.390625,
      "learning_rate": 1.8397240494144073e-05,
      "loss": 0.2298,
      "step": 1100
    },
    {
      "epoch": 0.17667422705025665,
      "grad_norm": 3.5625,
      "learning_rate": 1.8381196855446815e-05,
      "loss": 0.2593,
      "step": 1110
    },
    {
      "epoch": 0.17826588675341212,
      "grad_norm": 4.46875,
      "learning_rate": 1.836515321674956e-05,
      "loss": 0.2327,
      "step": 1120
    },
    {
      "epoch": 0.1798575464565676,
      "grad_norm": 7.1875,
      "learning_rate": 1.8349109578052303e-05,
      "loss": 0.2218,
      "step": 1130
    },
    {
      "epoch": 0.18144920615972304,
      "grad_norm": 4.96875,
      "learning_rate": 1.833306593935505e-05,
      "loss": 0.1936,
      "step": 1140
    },
    {
      "epoch": 0.18304086586287852,
      "grad_norm": 6.34375,
      "learning_rate": 1.8317022300657788e-05,
      "loss": 0.2331,
      "step": 1150
    },
    {
      "epoch": 0.184632525566034,
      "grad_norm": 5.15625,
      "learning_rate": 1.8300978661960534e-05,
      "loss": 0.222,
      "step": 1160
    },
    {
      "epoch": 0.18622418526918944,
      "grad_norm": 3.71875,
      "learning_rate": 1.8284935023263276e-05,
      "loss": 0.2065,
      "step": 1170
    },
    {
      "epoch": 0.1878158449723449,
      "grad_norm": 3.703125,
      "learning_rate": 1.8268891384566022e-05,
      "loss": 0.1914,
      "step": 1180
    },
    {
      "epoch": 0.18940750467550038,
      "grad_norm": 5.0625,
      "learning_rate": 1.8252847745868764e-05,
      "loss": 0.2619,
      "step": 1190
    },
    {
      "epoch": 0.19099916437865586,
      "grad_norm": 4.1875,
      "learning_rate": 1.823680410717151e-05,
      "loss": 0.21,
      "step": 1200
    },
    {
      "epoch": 0.1925908240818113,
      "grad_norm": 3.625,
      "learning_rate": 1.8220760468474252e-05,
      "loss": 0.2211,
      "step": 1210
    },
    {
      "epoch": 0.19418248378496678,
      "grad_norm": 4.875,
      "learning_rate": 1.8204716829776995e-05,
      "loss": 0.2627,
      "step": 1220
    },
    {
      "epoch": 0.19577414348812225,
      "grad_norm": 3.640625,
      "learning_rate": 1.8188673191079737e-05,
      "loss": 0.2194,
      "step": 1230
    },
    {
      "epoch": 0.1973658031912777,
      "grad_norm": 4.34375,
      "learning_rate": 1.8172629552382483e-05,
      "loss": 0.1924,
      "step": 1240
    },
    {
      "epoch": 0.19895746289443317,
      "grad_norm": 7.21875,
      "learning_rate": 1.8156585913685225e-05,
      "loss": 0.211,
      "step": 1250
    },
    {
      "epoch": 0.19959412677569535,
      "eval_loss": 0.20657111704349518,
      "eval_runtime": 17.1409,
      "eval_samples_per_second": 14.818,
      "eval_steps_per_second": 14.818,
      "step": 1254
    },
    {
      "epoch": 0.20054912259758864,
      "grad_norm": 3.828125,
      "learning_rate": 1.8140542274987967e-05,
      "loss": 0.2339,
      "step": 1260
    },
    {
      "epoch": 0.2021407823007441,
      "grad_norm": 3.1875,
      "learning_rate": 1.8124498636290713e-05,
      "loss": 0.1866,
      "step": 1270
    },
    {
      "epoch": 0.20373244200389956,
      "grad_norm": 2.140625,
      "learning_rate": 1.8108454997593455e-05,
      "loss": 0.2403,
      "step": 1280
    },
    {
      "epoch": 0.20532410170705503,
      "grad_norm": 3.921875,
      "learning_rate": 1.8092411358896198e-05,
      "loss": 0.2024,
      "step": 1290
    },
    {
      "epoch": 0.2069157614102105,
      "grad_norm": 2.875,
      "learning_rate": 1.807636772019894e-05,
      "loss": 0.1978,
      "step": 1300
    },
    {
      "epoch": 0.20850742111336595,
      "grad_norm": 5.34375,
      "learning_rate": 1.8060324081501686e-05,
      "loss": 0.2579,
      "step": 1310
    },
    {
      "epoch": 0.21009908081652143,
      "grad_norm": 3.40625,
      "learning_rate": 1.8044280442804428e-05,
      "loss": 0.2062,
      "step": 1320
    },
    {
      "epoch": 0.2116907405196769,
      "grad_norm": 8.5,
      "learning_rate": 1.8028236804107174e-05,
      "loss": 0.2293,
      "step": 1330
    },
    {
      "epoch": 0.21328240022283235,
      "grad_norm": 3.796875,
      "learning_rate": 1.8012193165409916e-05,
      "loss": 0.2088,
      "step": 1340
    },
    {
      "epoch": 0.21487405992598782,
      "grad_norm": 2.828125,
      "learning_rate": 1.7996149526712662e-05,
      "loss": 0.1859,
      "step": 1350
    },
    {
      "epoch": 0.2164657196291433,
      "grad_norm": 4.65625,
      "learning_rate": 1.7980105888015404e-05,
      "loss": 0.2247,
      "step": 1360
    },
    {
      "epoch": 0.21805737933229877,
      "grad_norm": 3.953125,
      "learning_rate": 1.7964062249318147e-05,
      "loss": 0.1535,
      "step": 1370
    },
    {
      "epoch": 0.2196490390354542,
      "grad_norm": 3.78125,
      "learning_rate": 1.794801861062089e-05,
      "loss": 0.2199,
      "step": 1380
    },
    {
      "epoch": 0.2212406987386097,
      "grad_norm": 3.328125,
      "learning_rate": 1.7931974971923635e-05,
      "loss": 0.2314,
      "step": 1390
    },
    {
      "epoch": 0.22283235844176516,
      "grad_norm": 4.25,
      "learning_rate": 1.7915931333226377e-05,
      "loss": 0.2339,
      "step": 1400
    },
    {
      "epoch": 0.2244240181449206,
      "grad_norm": 3.9375,
      "learning_rate": 1.789988769452912e-05,
      "loss": 0.2009,
      "step": 1410
    },
    {
      "epoch": 0.22601567784807608,
      "grad_norm": 4.25,
      "learning_rate": 1.7883844055831865e-05,
      "loss": 0.2424,
      "step": 1420
    },
    {
      "epoch": 0.22760733755123155,
      "grad_norm": 4.40625,
      "learning_rate": 1.7867800417134607e-05,
      "loss": 0.2096,
      "step": 1430
    },
    {
      "epoch": 0.229198997254387,
      "grad_norm": 5.96875,
      "learning_rate": 1.785175677843735e-05,
      "loss": 0.1673,
      "step": 1440
    },
    {
      "epoch": 0.23079065695754247,
      "grad_norm": 2.84375,
      "learning_rate": 1.7835713139740096e-05,
      "loss": 0.1667,
      "step": 1450
    },
    {
      "epoch": 0.23238231666069795,
      "grad_norm": 3.046875,
      "learning_rate": 1.7819669501042838e-05,
      "loss": 0.199,
      "step": 1460
    },
    {
      "epoch": 0.2328598145716446,
      "eval_loss": 0.20164676010608673,
      "eval_runtime": 17.2958,
      "eval_samples_per_second": 14.686,
      "eval_steps_per_second": 14.686,
      "step": 1463
    },
    {
      "epoch": 0.23397397636385342,
      "grad_norm": 3.0625,
      "learning_rate": 1.780362586234558e-05,
      "loss": 0.2162,
      "step": 1470
    },
    {
      "epoch": 0.23556563606700887,
      "grad_norm": 3.125,
      "learning_rate": 1.7787582223648326e-05,
      "loss": 0.1833,
      "step": 1480
    },
    {
      "epoch": 0.23715729577016434,
      "grad_norm": 3.453125,
      "learning_rate": 1.7771538584951068e-05,
      "loss": 0.2242,
      "step": 1490
    },
    {
      "epoch": 0.2387489554733198,
      "grad_norm": 2.71875,
      "learning_rate": 1.7755494946253814e-05,
      "loss": 0.2618,
      "step": 1500
    },
    {
      "epoch": 0.24034061517647526,
      "grad_norm": 2.4375,
      "learning_rate": 1.7739451307556553e-05,
      "loss": 0.1793,
      "step": 1510
    },
    {
      "epoch": 0.24193227487963073,
      "grad_norm": 3.9375,
      "learning_rate": 1.77234076688593e-05,
      "loss": 0.248,
      "step": 1520
    },
    {
      "epoch": 0.2435239345827862,
      "grad_norm": 7.875,
      "learning_rate": 1.770736403016204e-05,
      "loss": 0.2114,
      "step": 1530
    },
    {
      "epoch": 0.24511559428594168,
      "grad_norm": 4.125,
      "learning_rate": 1.7691320391464787e-05,
      "loss": 0.2143,
      "step": 1540
    },
    {
      "epoch": 0.24670725398909713,
      "grad_norm": 3.328125,
      "learning_rate": 1.767527675276753e-05,
      "loss": 0.2212,
      "step": 1550
    },
    {
      "epoch": 0.2482989136922526,
      "grad_norm": 4.53125,
      "learning_rate": 1.7659233114070275e-05,
      "loss": 0.2193,
      "step": 1560
    },
    {
      "epoch": 0.24989057339540807,
      "grad_norm": 2.84375,
      "learning_rate": 1.7643189475373017e-05,
      "loss": 0.1711,
      "step": 1570
    },
    {
      "epoch": 0.25148223309856355,
      "grad_norm": 3.15625,
      "learning_rate": 1.762714583667576e-05,
      "loss": 0.213,
      "step": 1580
    },
    {
      "epoch": 0.253073892801719,
      "grad_norm": 3.40625,
      "learning_rate": 1.7611102197978502e-05,
      "loss": 0.2355,
      "step": 1590
    },
    {
      "epoch": 0.25466555250487444,
      "grad_norm": 2.9375,
      "learning_rate": 1.7595058559281248e-05,
      "loss": 0.1682,
      "step": 1600
    },
    {
      "epoch": 0.25625721220802994,
      "grad_norm": 3.15625,
      "learning_rate": 1.757901492058399e-05,
      "loss": 0.2221,
      "step": 1610
    },
    {
      "epoch": 0.2578488719111854,
      "grad_norm": 4.0,
      "learning_rate": 1.7562971281886732e-05,
      "loss": 0.1885,
      "step": 1620
    },
    {
      "epoch": 0.25944053161434083,
      "grad_norm": 5.875,
      "learning_rate": 1.7546927643189478e-05,
      "loss": 0.2829,
      "step": 1630
    },
    {
      "epoch": 0.26103219131749633,
      "grad_norm": 2.265625,
      "learning_rate": 1.753088400449222e-05,
      "loss": 0.1795,
      "step": 1640
    },
    {
      "epoch": 0.2626238510206518,
      "grad_norm": 5.0625,
      "learning_rate": 1.7514840365794963e-05,
      "loss": 0.1677,
      "step": 1650
    },
    {
      "epoch": 0.2642155107238072,
      "grad_norm": 4.40625,
      "learning_rate": 1.7498796727097705e-05,
      "loss": 0.1979,
      "step": 1660
    },
    {
      "epoch": 0.2658071704269627,
      "grad_norm": 6.0625,
      "learning_rate": 1.748275308840045e-05,
      "loss": 0.2132,
      "step": 1670
    },
    {
      "epoch": 0.2661255023675938,
      "eval_loss": 0.20001497864723206,
      "eval_runtime": 17.5681,
      "eval_samples_per_second": 14.458,
      "eval_steps_per_second": 14.458,
      "step": 1672
    },
    {
      "epoch": 0.26739883013011817,
      "grad_norm": 3.40625,
      "learning_rate": 1.7466709449703193e-05,
      "loss": 0.2146,
      "step": 1680
    },
    {
      "epoch": 0.26899048983327367,
      "grad_norm": 3.078125,
      "learning_rate": 1.745066581100594e-05,
      "loss": 0.2632,
      "step": 1690
    },
    {
      "epoch": 0.2705821495364291,
      "grad_norm": 3.046875,
      "learning_rate": 1.743462217230868e-05,
      "loss": 0.2104,
      "step": 1700
    },
    {
      "epoch": 0.27217380923958456,
      "grad_norm": 4.4375,
      "learning_rate": 1.7418578533611427e-05,
      "loss": 0.1836,
      "step": 1710
    },
    {
      "epoch": 0.27376546894274006,
      "grad_norm": 3.078125,
      "learning_rate": 1.7402534894914166e-05,
      "loss": 0.2251,
      "step": 1720
    },
    {
      "epoch": 0.2753571286458955,
      "grad_norm": 3.640625,
      "learning_rate": 1.738649125621691e-05,
      "loss": 0.1818,
      "step": 1730
    },
    {
      "epoch": 0.27694878834905096,
      "grad_norm": 3.8125,
      "learning_rate": 1.7370447617519654e-05,
      "loss": 0.2245,
      "step": 1740
    },
    {
      "epoch": 0.27854044805220646,
      "grad_norm": 3.859375,
      "learning_rate": 1.73544039788224e-05,
      "loss": 0.2051,
      "step": 1750
    },
    {
      "epoch": 0.2801321077553619,
      "grad_norm": 5.0625,
      "learning_rate": 1.7338360340125142e-05,
      "loss": 0.2498,
      "step": 1760
    },
    {
      "epoch": 0.28172376745851735,
      "grad_norm": 4.8125,
      "learning_rate": 1.7322316701427884e-05,
      "loss": 0.247,
      "step": 1770
    },
    {
      "epoch": 0.28331542716167285,
      "grad_norm": 5.21875,
      "learning_rate": 1.730627306273063e-05,
      "loss": 0.2004,
      "step": 1780
    },
    {
      "epoch": 0.2849070868648283,
      "grad_norm": 4.34375,
      "learning_rate": 1.7290229424033372e-05,
      "loss": 0.1759,
      "step": 1790
    },
    {
      "epoch": 0.28649874656798374,
      "grad_norm": 5.25,
      "learning_rate": 1.7274185785336115e-05,
      "loss": 0.2298,
      "step": 1800
    },
    {
      "epoch": 0.28809040627113924,
      "grad_norm": 2.046875,
      "learning_rate": 1.725814214663886e-05,
      "loss": 0.1822,
      "step": 1810
    },
    {
      "epoch": 0.2896820659742947,
      "grad_norm": 9.25,
      "learning_rate": 1.7242098507941603e-05,
      "loss": 0.2617,
      "step": 1820
    },
    {
      "epoch": 0.29127372567745013,
      "grad_norm": 4.09375,
      "learning_rate": 1.7226054869244345e-05,
      "loss": 0.179,
      "step": 1830
    },
    {
      "epoch": 0.29286538538060564,
      "grad_norm": 3.265625,
      "learning_rate": 1.721001123054709e-05,
      "loss": 0.3092,
      "step": 1840
    },
    {
      "epoch": 0.2944570450837611,
      "grad_norm": 2.140625,
      "learning_rate": 1.7193967591849833e-05,
      "loss": 0.193,
      "step": 1850
    },
    {
      "epoch": 0.2960487047869166,
      "grad_norm": 2.3125,
      "learning_rate": 1.7177923953152575e-05,
      "loss": 0.2199,
      "step": 1860
    },
    {
      "epoch": 0.29764036449007203,
      "grad_norm": 4.09375,
      "learning_rate": 1.7161880314455318e-05,
      "loss": 0.2264,
      "step": 1870
    },
    {
      "epoch": 0.2992320241932275,
      "grad_norm": 5.34375,
      "learning_rate": 1.7145836675758064e-05,
      "loss": 0.2393,
      "step": 1880
    },
    {
      "epoch": 0.29939119016354304,
      "eval_loss": 0.2013416439294815,
      "eval_runtime": 17.2549,
      "eval_samples_per_second": 14.72,
      "eval_steps_per_second": 14.72,
      "step": 1881
    },
    {
      "epoch": 0.300823683896383,
      "grad_norm": 7.84375,
      "learning_rate": 1.7129793037060806e-05,
      "loss": 0.1833,
      "step": 1890
    },
    {
      "epoch": 0.3024153435995384,
      "grad_norm": 3.75,
      "learning_rate": 1.711374939836355e-05,
      "loss": 0.1988,
      "step": 1900
    },
    {
      "epoch": 0.30400700330269387,
      "grad_norm": 6.46875,
      "learning_rate": 1.7097705759666294e-05,
      "loss": 0.1939,
      "step": 1910
    },
    {
      "epoch": 0.30559866300584937,
      "grad_norm": 3.703125,
      "learning_rate": 1.708166212096904e-05,
      "loss": 0.2333,
      "step": 1920
    },
    {
      "epoch": 0.3071903227090048,
      "grad_norm": 4.03125,
      "learning_rate": 1.706561848227178e-05,
      "loss": 0.2026,
      "step": 1930
    },
    {
      "epoch": 0.30878198241216026,
      "grad_norm": 3.453125,
      "learning_rate": 1.7049574843574524e-05,
      "loss": 0.1765,
      "step": 1940
    },
    {
      "epoch": 0.31037364211531576,
      "grad_norm": 4.28125,
      "learning_rate": 1.7033531204877267e-05,
      "loss": 0.2231,
      "step": 1950
    },
    {
      "epoch": 0.3119653018184712,
      "grad_norm": 6.75,
      "learning_rate": 1.7017487566180012e-05,
      "loss": 0.2602,
      "step": 1960
    },
    {
      "epoch": 0.31355696152162665,
      "grad_norm": 4.75,
      "learning_rate": 1.7001443927482755e-05,
      "loss": 0.1873,
      "step": 1970
    },
    {
      "epoch": 0.31514862122478216,
      "grad_norm": 3.03125,
      "learning_rate": 1.6985400288785497e-05,
      "loss": 0.1701,
      "step": 1980
    },
    {
      "epoch": 0.3167402809279376,
      "grad_norm": 3.484375,
      "learning_rate": 1.6969356650088243e-05,
      "loss": 0.1989,
      "step": 1990
    },
    {
      "epoch": 0.31833194063109305,
      "grad_norm": 2.609375,
      "learning_rate": 1.6953313011390985e-05,
      "loss": 0.1837,
      "step": 2000
    },
    {
      "epoch": 0.31992360033424855,
      "grad_norm": 3.34375,
      "learning_rate": 1.6937269372693727e-05,
      "loss": 0.2641,
      "step": 2010
    },
    {
      "epoch": 0.321515260037404,
      "grad_norm": 4.875,
      "learning_rate": 1.692122573399647e-05,
      "loss": 0.1754,
      "step": 2020
    },
    {
      "epoch": 0.3231069197405595,
      "grad_norm": 4.0625,
      "learning_rate": 1.6905182095299216e-05,
      "loss": 0.1997,
      "step": 2030
    },
    {
      "epoch": 0.32469857944371494,
      "grad_norm": 3.046875,
      "learning_rate": 1.6889138456601958e-05,
      "loss": 0.208,
      "step": 2040
    },
    {
      "epoch": 0.3262902391468704,
      "grad_norm": 2.703125,
      "learning_rate": 1.6873094817904704e-05,
      "loss": 0.1811,
      "step": 2050
    },
    {
      "epoch": 0.3278818988500259,
      "grad_norm": 3.46875,
      "learning_rate": 1.6857051179207446e-05,
      "loss": 0.2159,
      "step": 2060
    },
    {
      "epoch": 0.32947355855318133,
      "grad_norm": 2.421875,
      "learning_rate": 1.6841007540510188e-05,
      "loss": 0.2213,
      "step": 2070
    },
    {
      "epoch": 0.3310652182563368,
      "grad_norm": 6.28125,
      "learning_rate": 1.682496390181293e-05,
      "loss": 0.248,
      "step": 2080
    },
    {
      "epoch": 0.3326568779594923,
      "grad_norm": 3.765625,
      "learning_rate": 1.6808920263115676e-05,
      "loss": 0.1793,
      "step": 2090
    },
    {
      "epoch": 0.3326568779594923,
      "eval_loss": 0.19450008869171143,
      "eval_runtime": 20.1367,
      "eval_samples_per_second": 12.614,
      "eval_steps_per_second": 12.614,
      "step": 2090
    },
    {
      "epoch": 0.3342485376626477,
      "grad_norm": 3.59375,
      "learning_rate": 1.679287662441842e-05,
      "loss": 0.1662,
      "step": 2100
    },
    {
      "epoch": 0.3358401973658032,
      "grad_norm": 4.6875,
      "learning_rate": 1.6776832985721164e-05,
      "loss": 0.1904,
      "step": 2110
    },
    {
      "epoch": 0.3374318570689587,
      "grad_norm": 5.8125,
      "learning_rate": 1.6760789347023907e-05,
      "loss": 0.1646,
      "step": 2120
    },
    {
      "epoch": 0.3390235167721141,
      "grad_norm": 4.4375,
      "learning_rate": 1.674474570832665e-05,
      "loss": 0.1964,
      "step": 2130
    },
    {
      "epoch": 0.34061517647526957,
      "grad_norm": 3.265625,
      "learning_rate": 1.672870206962939e-05,
      "loss": 0.2413,
      "step": 2140
    },
    {
      "epoch": 0.34220683617842507,
      "grad_norm": 3.765625,
      "learning_rate": 1.6712658430932137e-05,
      "loss": 0.1693,
      "step": 2150
    },
    {
      "epoch": 0.3437984958815805,
      "grad_norm": 2.4375,
      "learning_rate": 1.669661479223488e-05,
      "loss": 0.2391,
      "step": 2160
    },
    {
      "epoch": 0.34539015558473596,
      "grad_norm": 12.25,
      "learning_rate": 1.6680571153537625e-05,
      "loss": 0.1936,
      "step": 2170
    },
    {
      "epoch": 0.34698181528789146,
      "grad_norm": 2.421875,
      "learning_rate": 1.6664527514840368e-05,
      "loss": 0.2358,
      "step": 2180
    },
    {
      "epoch": 0.3485734749910469,
      "grad_norm": 4.34375,
      "learning_rate": 1.664848387614311e-05,
      "loss": 0.2573,
      "step": 2190
    },
    {
      "epoch": 0.3501651346942024,
      "grad_norm": 3.5,
      "learning_rate": 1.6632440237445856e-05,
      "loss": 0.1748,
      "step": 2200
    },
    {
      "epoch": 0.35175679439735785,
      "grad_norm": 7.46875,
      "learning_rate": 1.6616396598748598e-05,
      "loss": 0.1858,
      "step": 2210
    },
    {
      "epoch": 0.3533484541005133,
      "grad_norm": 3.359375,
      "learning_rate": 1.660035296005134e-05,
      "loss": 0.1854,
      "step": 2220
    },
    {
      "epoch": 0.3549401138036688,
      "grad_norm": 3.90625,
      "learning_rate": 1.6584309321354083e-05,
      "loss": 0.1961,
      "step": 2230
    },
    {
      "epoch": 0.35653177350682425,
      "grad_norm": 3.640625,
      "learning_rate": 1.656826568265683e-05,
      "loss": 0.197,
      "step": 2240
    },
    {
      "epoch": 0.3581234332099797,
      "grad_norm": 5.75,
      "learning_rate": 1.655222204395957e-05,
      "loss": 0.2126,
      "step": 2250
    },
    {
      "epoch": 0.3597150929131352,
      "grad_norm": 3.171875,
      "learning_rate": 1.6536178405262316e-05,
      "loss": 0.221,
      "step": 2260
    },
    {
      "epoch": 0.36130675261629064,
      "grad_norm": 4.03125,
      "learning_rate": 1.652013476656506e-05,
      "loss": 0.1927,
      "step": 2270
    },
    {
      "epoch": 0.3628984123194461,
      "grad_norm": 5.5,
      "learning_rate": 1.6504091127867805e-05,
      "loss": 0.2067,
      "step": 2280
    },
    {
      "epoch": 0.3644900720226016,
      "grad_norm": 2.4375,
      "learning_rate": 1.6488047489170543e-05,
      "loss": 0.1315,
      "step": 2290
    },
    {
      "epoch": 0.36592256575544146,
      "eval_loss": 0.19283312559127808,
      "eval_runtime": 20.2327,
      "eval_samples_per_second": 12.554,
      "eval_steps_per_second": 12.554,
      "step": 2299
    },
    {
      "epoch": 0.36608173172575703,
      "grad_norm": 2.09375,
      "learning_rate": 1.647200385047329e-05,
      "loss": 0.239,
      "step": 2300
    },
    {
      "epoch": 0.3676733914289125,
      "grad_norm": 6.65625,
      "learning_rate": 1.645596021177603e-05,
      "loss": 0.2119,
      "step": 2310
    },
    {
      "epoch": 0.369265051132068,
      "grad_norm": 3.578125,
      "learning_rate": 1.6439916573078777e-05,
      "loss": 0.2435,
      "step": 2320
    },
    {
      "epoch": 0.3708567108352234,
      "grad_norm": 2.96875,
      "learning_rate": 1.642387293438152e-05,
      "loss": 0.1718,
      "step": 2330
    },
    {
      "epoch": 0.37244837053837887,
      "grad_norm": 4.5625,
      "learning_rate": 1.6407829295684262e-05,
      "loss": 0.179,
      "step": 2340
    },
    {
      "epoch": 0.37404003024153437,
      "grad_norm": 3.828125,
      "learning_rate": 1.6391785656987008e-05,
      "loss": 0.1719,
      "step": 2350
    },
    {
      "epoch": 0.3756316899446898,
      "grad_norm": 5.34375,
      "learning_rate": 1.637574201828975e-05,
      "loss": 0.2554,
      "step": 2360
    },
    {
      "epoch": 0.37722334964784526,
      "grad_norm": 4.84375,
      "learning_rate": 1.6359698379592492e-05,
      "loss": 0.1946,
      "step": 2370
    },
    {
      "epoch": 0.37881500935100076,
      "grad_norm": 3.8125,
      "learning_rate": 1.6343654740895235e-05,
      "loss": 0.1726,
      "step": 2380
    },
    {
      "epoch": 0.3804066690541562,
      "grad_norm": 2.921875,
      "learning_rate": 1.632761110219798e-05,
      "loss": 0.1832,
      "step": 2390
    },
    {
      "epoch": 0.3819983287573117,
      "grad_norm": 2.734375,
      "learning_rate": 1.6311567463500723e-05,
      "loss": 0.1999,
      "step": 2400
    },
    {
      "epoch": 0.38358998846046716,
      "grad_norm": 3.859375,
      "learning_rate": 1.629552382480347e-05,
      "loss": 0.1524,
      "step": 2410
    },
    {
      "epoch": 0.3851816481636226,
      "grad_norm": 3.96875,
      "learning_rate": 1.627948018610621e-05,
      "loss": 0.2557,
      "step": 2420
    },
    {
      "epoch": 0.3867733078667781,
      "grad_norm": 4.625,
      "learning_rate": 1.6263436547408953e-05,
      "loss": 0.1527,
      "step": 2430
    },
    {
      "epoch": 0.38836496756993355,
      "grad_norm": 2.671875,
      "learning_rate": 1.6247392908711695e-05,
      "loss": 0.2068,
      "step": 2440
    },
    {
      "epoch": 0.389956627273089,
      "grad_norm": 3.0625,
      "learning_rate": 1.623134927001444e-05,
      "loss": 0.204,
      "step": 2450
    },
    {
      "epoch": 0.3915482869762445,
      "grad_norm": 3.640625,
      "learning_rate": 1.6215305631317184e-05,
      "loss": 0.202,
      "step": 2460
    },
    {
      "epoch": 0.39313994667939994,
      "grad_norm": 2.578125,
      "learning_rate": 1.619926199261993e-05,
      "loss": 0.1895,
      "step": 2470
    },
    {
      "epoch": 0.3947316063825554,
      "grad_norm": 3.5625,
      "learning_rate": 1.618321835392267e-05,
      "loss": 0.2701,
      "step": 2480
    },
    {
      "epoch": 0.3963232660857109,
      "grad_norm": 5.0,
      "learning_rate": 1.6167174715225414e-05,
      "loss": 0.2134,
      "step": 2490
    },
    {
      "epoch": 0.39791492578886634,
      "grad_norm": 2.90625,
      "learning_rate": 1.6151131076528156e-05,
      "loss": 0.1489,
      "step": 2500
    },
    {
      "epoch": 0.3991882535513907,
      "eval_loss": 0.19459104537963867,
      "eval_runtime": 20.0247,
      "eval_samples_per_second": 12.684,
      "eval_steps_per_second": 12.684,
      "step": 2508
    },
    {
      "epoch": 0.3995065854920218,
      "grad_norm": 2.84375,
      "learning_rate": 1.6135087437830902e-05,
      "loss": 0.1837,
      "step": 2510
    },
    {
      "epoch": 0.4010982451951773,
      "grad_norm": 3.515625,
      "learning_rate": 1.6119043799133644e-05,
      "loss": 0.1907,
      "step": 2520
    },
    {
      "epoch": 0.40268990489833273,
      "grad_norm": 1.703125,
      "learning_rate": 1.6103000160436387e-05,
      "loss": 0.2121,
      "step": 2530
    },
    {
      "epoch": 0.4042815646014882,
      "grad_norm": 4.8125,
      "learning_rate": 1.6086956521739132e-05,
      "loss": 0.2161,
      "step": 2540
    },
    {
      "epoch": 0.4058732243046437,
      "grad_norm": 3.34375,
      "learning_rate": 1.6070912883041875e-05,
      "loss": 0.2073,
      "step": 2550
    },
    {
      "epoch": 0.4074648840077991,
      "grad_norm": 4.6875,
      "learning_rate": 1.605486924434462e-05,
      "loss": 0.2019,
      "step": 2560
    },
    {
      "epoch": 0.4090565437109546,
      "grad_norm": 3.203125,
      "learning_rate": 1.6038825605647363e-05,
      "loss": 0.2403,
      "step": 2570
    },
    {
      "epoch": 0.41064820341411007,
      "grad_norm": 3.78125,
      "learning_rate": 1.6022781966950105e-05,
      "loss": 0.2638,
      "step": 2580
    },
    {
      "epoch": 0.4122398631172655,
      "grad_norm": 3.1875,
      "learning_rate": 1.6006738328252847e-05,
      "loss": 0.1545,
      "step": 2590
    },
    {
      "epoch": 0.413831522820421,
      "grad_norm": 3.015625,
      "learning_rate": 1.5990694689555593e-05,
      "loss": 0.1785,
      "step": 2600
    },
    {
      "epoch": 0.41542318252357646,
      "grad_norm": 5.9375,
      "learning_rate": 1.5974651050858336e-05,
      "loss": 0.2051,
      "step": 2610
    },
    {
      "epoch": 0.4170148422267319,
      "grad_norm": 3.640625,
      "learning_rate": 1.595860741216108e-05,
      "loss": 0.199,
      "step": 2620
    },
    {
      "epoch": 0.4186065019298874,
      "grad_norm": 3.59375,
      "learning_rate": 1.5942563773463824e-05,
      "loss": 0.2658,
      "step": 2630
    },
    {
      "epoch": 0.42019816163304285,
      "grad_norm": 3.828125,
      "learning_rate": 1.5926520134766566e-05,
      "loss": 0.1834,
      "step": 2640
    },
    {
      "epoch": 0.4217898213361983,
      "grad_norm": 4.09375,
      "learning_rate": 1.5910476496069308e-05,
      "loss": 0.1773,
      "step": 2650
    },
    {
      "epoch": 0.4233814810393538,
      "grad_norm": 4.78125,
      "learning_rate": 1.5894432857372054e-05,
      "loss": 0.1712,
      "step": 2660
    },
    {
      "epoch": 0.42497314074250925,
      "grad_norm": 7.21875,
      "learning_rate": 1.5878389218674796e-05,
      "loss": 0.2713,
      "step": 2670
    },
    {
      "epoch": 0.4265648004456647,
      "grad_norm": 4.3125,
      "learning_rate": 1.5862345579977542e-05,
      "loss": 0.1936,
      "step": 2680
    },
    {
      "epoch": 0.4281564601488202,
      "grad_norm": 4.46875,
      "learning_rate": 1.5846301941280284e-05,
      "loss": 0.2862,
      "step": 2690
    },
    {
      "epoch": 0.42974811985197564,
      "grad_norm": 5.4375,
      "learning_rate": 1.5830258302583027e-05,
      "loss": 0.2374,
      "step": 2700
    },
    {
      "epoch": 0.4313397795551311,
      "grad_norm": 5.40625,
      "learning_rate": 1.581421466388577e-05,
      "loss": 0.1734,
      "step": 2710
    },
    {
      "epoch": 0.43245394134733994,
      "eval_loss": 0.18887650966644287,
      "eval_runtime": 20.0303,
      "eval_samples_per_second": 12.681,
      "eval_steps_per_second": 12.681,
      "step": 2717
    },
    {
      "epoch": 0.4329314392582866,
      "grad_norm": 3.609375,
      "learning_rate": 1.5798171025188515e-05,
      "loss": 0.2102,
      "step": 2720
    },
    {
      "epoch": 0.43452309896144203,
      "grad_norm": 2.890625,
      "learning_rate": 1.5782127386491257e-05,
      "loss": 0.2623,
      "step": 2730
    },
    {
      "epoch": 0.43611475866459753,
      "grad_norm": 1.890625,
      "learning_rate": 1.5766083747794e-05,
      "loss": 0.2069,
      "step": 2740
    },
    {
      "epoch": 0.437706418367753,
      "grad_norm": 1.65625,
      "learning_rate": 1.5750040109096745e-05,
      "loss": 0.2135,
      "step": 2750
    },
    {
      "epoch": 0.4392980780709084,
      "grad_norm": 5.875,
      "learning_rate": 1.5733996470399488e-05,
      "loss": 0.1815,
      "step": 2760
    },
    {
      "epoch": 0.44088973777406393,
      "grad_norm": 7.8125,
      "learning_rate": 1.5717952831702233e-05,
      "loss": 0.2517,
      "step": 2770
    },
    {
      "epoch": 0.4424813974772194,
      "grad_norm": 5.65625,
      "learning_rate": 1.5701909193004972e-05,
      "loss": 0.2244,
      "step": 2780
    },
    {
      "epoch": 0.4440730571803748,
      "grad_norm": 3.75,
      "learning_rate": 1.5685865554307718e-05,
      "loss": 0.2754,
      "step": 2790
    },
    {
      "epoch": 0.4456647168835303,
      "grad_norm": 2.625,
      "learning_rate": 1.566982191561046e-05,
      "loss": 0.2041,
      "step": 2800
    },
    {
      "epoch": 0.44725637658668577,
      "grad_norm": 4.46875,
      "learning_rate": 1.5653778276913206e-05,
      "loss": 0.1895,
      "step": 2810
    },
    {
      "epoch": 0.4488480362898412,
      "grad_norm": 6.375,
      "learning_rate": 1.563773463821595e-05,
      "loss": 0.2331,
      "step": 2820
    },
    {
      "epoch": 0.4504396959929967,
      "grad_norm": 6.4375,
      "learning_rate": 1.5621690999518694e-05,
      "loss": 0.1776,
      "step": 2830
    },
    {
      "epoch": 0.45203135569615216,
      "grad_norm": 7.3125,
      "learning_rate": 1.5605647360821436e-05,
      "loss": 0.1953,
      "step": 2840
    },
    {
      "epoch": 0.4536230153993076,
      "grad_norm": 2.90625,
      "learning_rate": 1.558960372212418e-05,
      "loss": 0.2038,
      "step": 2850
    },
    {
      "epoch": 0.4552146751024631,
      "grad_norm": 5.09375,
      "learning_rate": 1.557356008342692e-05,
      "loss": 0.1951,
      "step": 2860
    },
    {
      "epoch": 0.45680633480561855,
      "grad_norm": 3.9375,
      "learning_rate": 1.5557516444729667e-05,
      "loss": 0.2458,
      "step": 2870
    },
    {
      "epoch": 0.458397994508774,
      "grad_norm": 2.671875,
      "learning_rate": 1.554147280603241e-05,
      "loss": 0.2819,
      "step": 2880
    },
    {
      "epoch": 0.4599896542119295,
      "grad_norm": 2.625,
      "learning_rate": 1.552542916733515e-05,
      "loss": 0.2086,
      "step": 2890
    },
    {
      "epoch": 0.46158131391508495,
      "grad_norm": 3.390625,
      "learning_rate": 1.5509385528637897e-05,
      "loss": 0.1607,
      "step": 2900
    },
    {
      "epoch": 0.46317297361824045,
      "grad_norm": 2.546875,
      "learning_rate": 1.549334188994064e-05,
      "loss": 0.2651,
      "step": 2910
    },
    {
      "epoch": 0.4647646333213959,
      "grad_norm": 5.0625,
      "learning_rate": 1.5477298251243382e-05,
      "loss": 0.219,
      "step": 2920
    },
    {
      "epoch": 0.4657196291432892,
      "eval_loss": 0.1854693740606308,
      "eval_runtime": 17.4236,
      "eval_samples_per_second": 14.578,
      "eval_steps_per_second": 14.578,
      "step": 2926
    },
    {
      "epoch": 0.46635629302455134,
      "grad_norm": 4.25,
      "learning_rate": 1.5461254612546128e-05,
      "loss": 0.1906,
      "step": 2930
    },
    {
      "epoch": 0.46794795272770684,
      "grad_norm": 2.125,
      "learning_rate": 1.544521097384887e-05,
      "loss": 0.1632,
      "step": 2940
    },
    {
      "epoch": 0.4695396124308623,
      "grad_norm": 5.75,
      "learning_rate": 1.5429167335151612e-05,
      "loss": 0.1724,
      "step": 2950
    },
    {
      "epoch": 0.47113127213401773,
      "grad_norm": 4.46875,
      "learning_rate": 1.5413123696454358e-05,
      "loss": 0.1659,
      "step": 2960
    },
    {
      "epoch": 0.47272293183717323,
      "grad_norm": 2.859375,
      "learning_rate": 1.53970800577571e-05,
      "loss": 0.1916,
      "step": 2970
    },
    {
      "epoch": 0.4743145915403287,
      "grad_norm": 4.03125,
      "learning_rate": 1.5381036419059846e-05,
      "loss": 0.2018,
      "step": 2980
    },
    {
      "epoch": 0.4759062512434841,
      "grad_norm": 3.609375,
      "learning_rate": 1.5364992780362585e-05,
      "loss": 0.169,
      "step": 2990
    },
    {
      "epoch": 0.4774979109466396,
      "grad_norm": 4.84375,
      "learning_rate": 1.534894914166533e-05,
      "loss": 0.2141,
      "step": 3000
    },
    {
      "epoch": 0.47908957064979507,
      "grad_norm": 1.984375,
      "learning_rate": 1.5332905502968073e-05,
      "loss": 0.1759,
      "step": 3010
    },
    {
      "epoch": 0.4806812303529505,
      "grad_norm": 5.1875,
      "learning_rate": 1.531686186427082e-05,
      "loss": 0.2285,
      "step": 3020
    },
    {
      "epoch": 0.482272890056106,
      "grad_norm": 3.25,
      "learning_rate": 1.530081822557356e-05,
      "loss": 0.184,
      "step": 3030
    },
    {
      "epoch": 0.48386454975926146,
      "grad_norm": 3.515625,
      "learning_rate": 1.5284774586876307e-05,
      "loss": 0.198,
      "step": 3040
    },
    {
      "epoch": 0.4854562094624169,
      "grad_norm": 3.046875,
      "learning_rate": 1.526873094817905e-05,
      "loss": 0.1873,
      "step": 3050
    },
    {
      "epoch": 0.4870478691655724,
      "grad_norm": 5.0625,
      "learning_rate": 1.525268730948179e-05,
      "loss": 0.2076,
      "step": 3060
    },
    {
      "epoch": 0.48863952886872786,
      "grad_norm": 4.125,
      "learning_rate": 1.5236643670784534e-05,
      "loss": 0.1736,
      "step": 3070
    },
    {
      "epoch": 0.49023118857188336,
      "grad_norm": 4.03125,
      "learning_rate": 1.5220600032087278e-05,
      "loss": 0.2033,
      "step": 3080
    },
    {
      "epoch": 0.4918228482750388,
      "grad_norm": 5.1875,
      "learning_rate": 1.5204556393390022e-05,
      "loss": 0.2062,
      "step": 3090
    },
    {
      "epoch": 0.49341450797819425,
      "grad_norm": 5.0625,
      "learning_rate": 1.5188512754692766e-05,
      "loss": 0.1701,
      "step": 3100
    },
    {
      "epoch": 0.49500616768134975,
      "grad_norm": 3.46875,
      "learning_rate": 1.517246911599551e-05,
      "loss": 0.1874,
      "step": 3110
    },
    {
      "epoch": 0.4965978273845052,
      "grad_norm": 5.78125,
      "learning_rate": 1.5156425477298254e-05,
      "loss": 0.1446,
      "step": 3120
    },
    {
      "epoch": 0.49818948708766064,
      "grad_norm": 3.25,
      "learning_rate": 1.5140381838600995e-05,
      "loss": 0.1705,
      "step": 3130
    },
    {
      "epoch": 0.49898531693923837,
      "eval_loss": 0.1869022697210312,
      "eval_runtime": 17.377,
      "eval_samples_per_second": 14.617,
      "eval_steps_per_second": 14.617,
      "step": 3135
    },
    {
      "epoch": 0.49978114679081614,
      "grad_norm": 2.109375,
      "learning_rate": 1.5124338199903739e-05,
      "loss": 0.1662,
      "step": 3140
    },
    {
      "epoch": 0.5013728064939715,
      "grad_norm": 5.59375,
      "learning_rate": 1.5108294561206483e-05,
      "loss": 0.2257,
      "step": 3150
    },
    {
      "epoch": 0.5029644661971271,
      "grad_norm": 3.484375,
      "learning_rate": 1.5092250922509227e-05,
      "loss": 0.1627,
      "step": 3160
    },
    {
      "epoch": 0.5045561259002825,
      "grad_norm": 4.125,
      "learning_rate": 1.507620728381197e-05,
      "loss": 0.3629,
      "step": 3170
    },
    {
      "epoch": 0.506147785603438,
      "grad_norm": 2.75,
      "learning_rate": 1.5060163645114713e-05,
      "loss": 0.1946,
      "step": 3180
    },
    {
      "epoch": 0.5077394453065934,
      "grad_norm": 3.125,
      "learning_rate": 1.5044120006417457e-05,
      "loss": 0.1649,
      "step": 3190
    },
    {
      "epoch": 0.5093311050097489,
      "grad_norm": 3.484375,
      "learning_rate": 1.50280763677202e-05,
      "loss": 0.2218,
      "step": 3200
    },
    {
      "epoch": 0.5109227647129044,
      "grad_norm": 4.375,
      "learning_rate": 1.5012032729022944e-05,
      "loss": 0.1767,
      "step": 3210
    },
    {
      "epoch": 0.5125144244160599,
      "grad_norm": 2.328125,
      "learning_rate": 1.4995989090325686e-05,
      "loss": 0.1505,
      "step": 3220
    },
    {
      "epoch": 0.5141060841192153,
      "grad_norm": 4.21875,
      "learning_rate": 1.497994545162843e-05,
      "loss": 0.1781,
      "step": 3230
    },
    {
      "epoch": 0.5156977438223708,
      "grad_norm": 5.09375,
      "learning_rate": 1.4963901812931174e-05,
      "loss": 0.1779,
      "step": 3240
    },
    {
      "epoch": 0.5172894035255262,
      "grad_norm": 3.0,
      "learning_rate": 1.4947858174233918e-05,
      "loss": 0.1561,
      "step": 3250
    },
    {
      "epoch": 0.5188810632286817,
      "grad_norm": 2.765625,
      "learning_rate": 1.4931814535536662e-05,
      "loss": 0.153,
      "step": 3260
    },
    {
      "epoch": 0.5204727229318372,
      "grad_norm": 2.421875,
      "learning_rate": 1.4915770896839406e-05,
      "loss": 0.2017,
      "step": 3270
    },
    {
      "epoch": 0.5220643826349927,
      "grad_norm": 4.3125,
      "learning_rate": 1.4899727258142147e-05,
      "loss": 0.2114,
      "step": 3280
    },
    {
      "epoch": 0.5236560423381481,
      "grad_norm": 5.03125,
      "learning_rate": 1.488368361944489e-05,
      "loss": 0.2078,
      "step": 3290
    },
    {
      "epoch": 0.5252477020413036,
      "grad_norm": 4.34375,
      "learning_rate": 1.4867639980747635e-05,
      "loss": 0.1939,
      "step": 3300
    },
    {
      "epoch": 0.526839361744459,
      "grad_norm": 3.9375,
      "learning_rate": 1.4851596342050379e-05,
      "loss": 0.222,
      "step": 3310
    },
    {
      "epoch": 0.5284310214476144,
      "grad_norm": 3.6875,
      "learning_rate": 1.4835552703353123e-05,
      "loss": 0.1794,
      "step": 3320
    },
    {
      "epoch": 0.53002268115077,
      "grad_norm": 3.796875,
      "learning_rate": 1.4819509064655865e-05,
      "loss": 0.2427,
      "step": 3330
    },
    {
      "epoch": 0.5316143408539254,
      "grad_norm": 3.140625,
      "learning_rate": 1.480346542595861e-05,
      "loss": 0.192,
      "step": 3340
    },
    {
      "epoch": 0.5322510047351876,
      "eval_loss": 0.18332095444202423,
      "eval_runtime": 17.0952,
      "eval_samples_per_second": 14.858,
      "eval_steps_per_second": 14.858,
      "step": 3344
    },
    {
      "epoch": 0.5332060005570809,
      "grad_norm": 3.3125,
      "learning_rate": 1.4787421787261352e-05,
      "loss": 0.152,
      "step": 3350
    },
    {
      "epoch": 0.5347976602602363,
      "grad_norm": 3.6875,
      "learning_rate": 1.4771378148564096e-05,
      "loss": 0.1599,
      "step": 3360
    },
    {
      "epoch": 0.5363893199633918,
      "grad_norm": 3.5625,
      "learning_rate": 1.4755334509866838e-05,
      "loss": 0.1821,
      "step": 3370
    },
    {
      "epoch": 0.5379809796665473,
      "grad_norm": 4.84375,
      "learning_rate": 1.4739290871169582e-05,
      "loss": 0.1775,
      "step": 3380
    },
    {
      "epoch": 0.5395726393697028,
      "grad_norm": 3.890625,
      "learning_rate": 1.4723247232472326e-05,
      "loss": 0.1712,
      "step": 3390
    },
    {
      "epoch": 0.5411642990728582,
      "grad_norm": 6.6875,
      "learning_rate": 1.470720359377507e-05,
      "loss": 0.2063,
      "step": 3400
    },
    {
      "epoch": 0.5427559587760137,
      "grad_norm": 3.09375,
      "learning_rate": 1.4691159955077814e-05,
      "loss": 0.1776,
      "step": 3410
    },
    {
      "epoch": 0.5443476184791691,
      "grad_norm": 4.71875,
      "learning_rate": 1.4675116316380555e-05,
      "loss": 0.2757,
      "step": 3420
    },
    {
      "epoch": 0.5459392781823246,
      "grad_norm": 5.59375,
      "learning_rate": 1.4659072677683299e-05,
      "loss": 0.2126,
      "step": 3430
    },
    {
      "epoch": 0.5475309378854801,
      "grad_norm": 5.5625,
      "learning_rate": 1.4643029038986043e-05,
      "loss": 0.2126,
      "step": 3440
    },
    {
      "epoch": 0.5491225975886356,
      "grad_norm": 4.78125,
      "learning_rate": 1.4626985400288787e-05,
      "loss": 0.1764,
      "step": 3450
    },
    {
      "epoch": 0.550714257291791,
      "grad_norm": 3.390625,
      "learning_rate": 1.4610941761591531e-05,
      "loss": 0.1633,
      "step": 3460
    },
    {
      "epoch": 0.5523059169949465,
      "grad_norm": 1.8828125,
      "learning_rate": 1.4594898122894275e-05,
      "loss": 0.1518,
      "step": 3470
    },
    {
      "epoch": 0.5538975766981019,
      "grad_norm": 4.28125,
      "learning_rate": 1.4578854484197017e-05,
      "loss": 0.2246,
      "step": 3480
    },
    {
      "epoch": 0.5554892364012574,
      "grad_norm": 4.46875,
      "learning_rate": 1.456281084549976e-05,
      "loss": 0.2147,
      "step": 3490
    },
    {
      "epoch": 0.5570808961044129,
      "grad_norm": 2.390625,
      "learning_rate": 1.4546767206802504e-05,
      "loss": 0.1718,
      "step": 3500
    },
    {
      "epoch": 0.5586725558075684,
      "grad_norm": 3.140625,
      "learning_rate": 1.4530723568105248e-05,
      "loss": 0.1885,
      "step": 3510
    },
    {
      "epoch": 0.5602642155107238,
      "grad_norm": 2.640625,
      "learning_rate": 1.4514679929407992e-05,
      "loss": 0.1278,
      "step": 3520
    },
    {
      "epoch": 0.5618558752138793,
      "grad_norm": 3.625,
      "learning_rate": 1.4498636290710734e-05,
      "loss": 0.2075,
      "step": 3530
    },
    {
      "epoch": 0.5634475349170347,
      "grad_norm": 4.90625,
      "learning_rate": 1.4482592652013478e-05,
      "loss": 0.1832,
      "step": 3540
    },
    {
      "epoch": 0.5650391946201903,
      "grad_norm": 3.125,
      "learning_rate": 1.4466549013316222e-05,
      "loss": 0.2203,
      "step": 3550
    },
    {
      "epoch": 0.5655166925311368,
      "eval_loss": 0.17973919212818146,
      "eval_runtime": 17.1013,
      "eval_samples_per_second": 14.853,
      "eval_steps_per_second": 14.853,
      "step": 3553
    },
    {
      "epoch": 0.5666308543233457,
      "grad_norm": 3.28125,
      "learning_rate": 1.4450505374618964e-05,
      "loss": 0.1594,
      "step": 3560
    },
    {
      "epoch": 0.5682225140265011,
      "grad_norm": 2.9375,
      "learning_rate": 1.4434461735921708e-05,
      "loss": 0.2293,
      "step": 3570
    },
    {
      "epoch": 0.5698141737296566,
      "grad_norm": 5.25,
      "learning_rate": 1.441841809722445e-05,
      "loss": 0.2221,
      "step": 3580
    },
    {
      "epoch": 0.571405833432812,
      "grad_norm": 3.125,
      "learning_rate": 1.4402374458527195e-05,
      "loss": 0.1922,
      "step": 3590
    },
    {
      "epoch": 0.5729974931359675,
      "grad_norm": 2.859375,
      "learning_rate": 1.4386330819829939e-05,
      "loss": 0.2405,
      "step": 3600
    },
    {
      "epoch": 0.574589152839123,
      "grad_norm": 6.8125,
      "learning_rate": 1.4370287181132683e-05,
      "loss": 0.2046,
      "step": 3610
    },
    {
      "epoch": 0.5761808125422785,
      "grad_norm": 6.84375,
      "learning_rate": 1.4354243542435427e-05,
      "loss": 0.1706,
      "step": 3620
    },
    {
      "epoch": 0.5777724722454339,
      "grad_norm": 3.015625,
      "learning_rate": 1.4338199903738168e-05,
      "loss": 0.1786,
      "step": 3630
    },
    {
      "epoch": 0.5793641319485894,
      "grad_norm": 2.65625,
      "learning_rate": 1.4322156265040912e-05,
      "loss": 0.1967,
      "step": 3640
    },
    {
      "epoch": 0.5809557916517448,
      "grad_norm": 3.921875,
      "learning_rate": 1.4306112626343656e-05,
      "loss": 0.1717,
      "step": 3650
    },
    {
      "epoch": 0.5825474513549003,
      "grad_norm": 4.40625,
      "learning_rate": 1.42900689876464e-05,
      "loss": 0.1842,
      "step": 3660
    },
    {
      "epoch": 0.5841391110580558,
      "grad_norm": 3.390625,
      "learning_rate": 1.4274025348949144e-05,
      "loss": 0.2265,
      "step": 3670
    },
    {
      "epoch": 0.5857307707612113,
      "grad_norm": 1.828125,
      "learning_rate": 1.4257981710251888e-05,
      "loss": 0.1809,
      "step": 3680
    },
    {
      "epoch": 0.5873224304643667,
      "grad_norm": 2.828125,
      "learning_rate": 1.424193807155463e-05
|
|
"loss": 0.1774,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.5889140901675222,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.4225894432857372e-05,
|
|
"loss": 0.2663,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.5905057498706776,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 1.4209850794160116e-05,
|
|
"loss": 0.1994,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.5920974095738332,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 1.419380715546286e-05,
|
|
"loss": 0.1795,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.5936890692769886,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 1.4177763516765603e-05,
|
|
"loss": 0.1972,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.5952807289801441,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 1.4161719878068347e-05,
|
|
"loss": 0.171,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.5968723886832995,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 1.4145676239371091e-05,
|
|
"loss": 0.264,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.598464048386455,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 1.4129632600673835e-05,
|
|
"loss": 0.2137,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.5987823803270861,
|
|
"eval_loss": 0.17893706262111664,
|
|
"eval_runtime": 17.221,
|
|
"eval_samples_per_second": 14.749,
|
|
"eval_steps_per_second": 14.749,
|
|
"step": 3762
|
|
},
|
|
{
|
|
"epoch": 0.6000557080896104,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.4113588961976577e-05,
|
|
"loss": 0.2115,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.601647367792766,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.409754532327932e-05,
|
|
"loss": 0.175,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.6032390274959214,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.4081501684582064e-05,
|
|
"loss": 0.2347,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.6048306871990768,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 1.4065458045884808e-05,
|
|
"loss": 0.2158,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.6064223469022323,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.4049414407187552e-05,
|
|
"loss": 0.1754,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.6080140066053877,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.4033370768490296e-05,
|
|
"loss": 0.1774,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.6096056663085432,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 1.401732712979304e-05,
|
|
"loss": 0.2212,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.6111973260116987,
|
|
"grad_norm": 4.90625,
|
|
"learning_rate": 1.400128349109578e-05,
|
|
"loss": 0.1999,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.6127889857148542,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.3985239852398524e-05,
|
|
"loss": 0.1336,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.6143806454180096,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 1.3969196213701268e-05,
|
|
"loss": 0.2271,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.6159723051211651,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 1.3953152575004013e-05,
|
|
"loss": 0.2023,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.6175639648243205,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 1.3937108936306757e-05,
|
|
"loss": 0.1738,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.6191556245274761,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 1.3921065297609499e-05,
|
|
"loss": 0.1744,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.6207472842306315,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 1.3905021658912243e-05,
|
|
"loss": 0.2376,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.622338943933787,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 1.3888978020214985e-05,
|
|
"loss": 0.1982,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.6239306036369424,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.387293438151773e-05,
|
|
"loss": 0.2116,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.6255222633400979,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.3856890742820472e-05,
|
|
"loss": 0.1988,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.6271139230432533,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 1.3840847104123216e-05,
|
|
"loss": 0.1777,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.6287055827464089,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 1.382480346542596e-05,
|
|
"loss": 0.1629,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.6302972424495643,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.3808759826728704e-05,
|
|
"loss": 0.1901,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.6318889021527198,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.3792716188031448e-05,
|
|
"loss": 0.1782,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.6320480681230353,
|
|
"eval_loss": 0.1779349446296692,
|
|
"eval_runtime": 17.4478,
|
|
"eval_samples_per_second": 14.558,
|
|
"eval_steps_per_second": 14.558,
|
|
"step": 3971
|
|
},
|
|
{
|
|
"epoch": 0.6334805618558752,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 1.3776672549334188e-05,
|
|
"loss": 0.1732,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.6350722215590306,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.3760628910636932e-05,
|
|
"loss": 0.1733,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.6366638812621861,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 1.3744585271939676e-05,
|
|
"loss": 0.2284,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.6382555409653417,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 1.372854163324242e-05,
|
|
"loss": 0.1628,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.6398472006684971,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 1.3712497994545165e-05,
|
|
"loss": 0.2243,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.6414388603716525,
|
|
"grad_norm": 6.90625,
|
|
"learning_rate": 1.3696454355847909e-05,
|
|
"loss": 0.2358,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.643030520074808,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 1.3680410717150651e-05,
|
|
"loss": 0.2164,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.6446221797779634,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 1.3664367078453393e-05,
|
|
"loss": 0.1866,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.646213839481119,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 1.3648323439756137e-05,
|
|
"loss": 0.137,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.6478054991842744,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 1.3632279801058881e-05,
|
|
"loss": 0.1813,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.6493971588874299,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 1.3616236162361625e-05,
|
|
"loss": 0.2044,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.6509888185905853,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.3600192523664368e-05,
|
|
"loss": 0.1777,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.6525804782937408,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.3584148884967112e-05,
|
|
"loss": 0.1958,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.6541721379968962,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 1.3568105246269856e-05,
|
|
"loss": 0.1238,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.6557637977000518,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.3552061607572598e-05,
|
|
"loss": 0.2422,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.6573554574032072,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 1.3536017968875342e-05,
|
|
"loss": 0.202,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.6589471171063627,
|
|
"grad_norm": 5.96875,
|
|
"learning_rate": 1.3519974330178084e-05,
|
|
"loss": 0.2114,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.6605387768095181,
|
|
"grad_norm": 7.84375,
|
|
"learning_rate": 1.3503930691480828e-05,
|
|
"loss": 0.2146,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.6621304365126736,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 1.3487887052783573e-05,
|
|
"loss": 0.1636,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.663722096215829,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.3471843414086317e-05,
|
|
"loss": 0.2212,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.6653137559189846,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 1.345579977538906e-05,
|
|
"loss": 0.1815,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.6653137559189846,
|
|
"eval_loss": 0.1762889325618744,
|
|
"eval_runtime": 17.406,
|
|
"eval_samples_per_second": 14.593,
|
|
"eval_steps_per_second": 14.593,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.66690541562214,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 1.3439756136691805e-05,
|
|
"loss": 0.239,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.6684970753252955,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 1.3423712497994545e-05,
|
|
"loss": 0.1875,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.6700887350284509,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 1.340766885929729e-05,
|
|
"loss": 0.1538,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.6716803947316063,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 1.3391625220600033e-05,
|
|
"loss": 0.1764,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.6732720544347619,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 1.3375581581902777e-05,
|
|
"loss": 0.1867,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.6748637141379173,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 1.3359537943205521e-05,
|
|
"loss": 0.1994,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.6764553738410728,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.3343494304508264e-05,
|
|
"loss": 0.2228,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.6780470335442282,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.3327450665811008e-05,
|
|
"loss": 0.1771,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.6796386932473837,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 1.331140702711375e-05,
|
|
"loss": 0.2166,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.6812303529505391,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 1.3295363388416494e-05,
|
|
"loss": 0.2073,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.6828220126536947,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 1.3279319749719236e-05,
|
|
"loss": 0.179,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.6844136723568501,
|
|
"grad_norm": 8.375,
|
|
"learning_rate": 1.326327611102198e-05,
|
|
"loss": 0.255,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.6860053320600056,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 1.3247232472324725e-05,
|
|
"loss": 0.2664,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.687596991763161,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 1.3231188833627469e-05,
|
|
"loss": 0.1805,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.6891886514663165,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 1.3215145194930213e-05,
|
|
"loss": 0.2434,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.6907803111694719,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.3199101556232953e-05,
|
|
"loss": 0.2115,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.6923719708726275,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.3183057917535697e-05,
|
|
"loss": 0.1341,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.6939636305757829,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 1.3167014278838441e-05,
|
|
"loss": 0.2175,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.6955552902789384,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 1.3150970640141185e-05,
|
|
"loss": 0.1811,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.6971469499820938,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.313492700144393e-05,
|
|
"loss": 0.2267,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.6985794437149337,
|
|
"eval_loss": 0.17379696667194366,
|
|
"eval_runtime": 17.3632,
|
|
"eval_samples_per_second": 14.629,
|
|
"eval_steps_per_second": 14.629,
|
|
"step": 4389
|
|
},
|
|
{
|
|
"epoch": 0.6987386096852493,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 1.3118883362746673e-05,
|
|
"loss": 0.1974,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.7003302693884048,
|
|
"grad_norm": 6.46875,
|
|
"learning_rate": 1.3102839724049416e-05,
|
|
"loss": 0.2249,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.7019219290915603,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 1.3086796085352158e-05,
|
|
"loss": 0.1432,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.7035135887947157,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.3070752446654902e-05,
|
|
"loss": 0.2116,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.7051052484978712,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 1.3054708807957646e-05,
|
|
"loss": 0.2042,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.7066969082010266,
|
|
"grad_norm": 7.21875,
|
|
"learning_rate": 1.303866516926039e-05,
|
|
"loss": 0.1804,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.708288567904182,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 1.3022621530563133e-05,
|
|
"loss": 0.23,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.7098802276073376,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 1.3006577891865877e-05,
|
|
"loss": 0.1513,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.711471887310493,
|
|
"grad_norm": 6.28125,
|
|
"learning_rate": 1.299053425316862e-05,
|
|
"loss": 0.1837,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.7130635470136485,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 1.2974490614471363e-05,
|
|
"loss": 0.2039,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.7146552067168039,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 1.2958446975774105e-05,
|
|
"loss": 0.227,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.7162468664199594,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 1.294240333707685e-05,
|
|
"loss": 0.2172,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.7178385261231148,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 1.2926359698379593e-05,
|
|
"loss": 0.2052,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.7194301858262704,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 1.2910316059682337e-05,
|
|
"loss": 0.1807,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.7210218455294258,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 1.2894272420985081e-05,
|
|
"loss": 0.1674,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.7226135052325813,
|
|
"grad_norm": 4.90625,
|
|
"learning_rate": 1.2878228782287825e-05,
|
|
"loss": 0.2172,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.7242051649357367,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 1.2862185143590566e-05,
|
|
"loss": 0.1861,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.7257968246388922,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 1.284614150489331e-05,
|
|
"loss": 0.1741,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.7273884843420477,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 1.2830097866196054e-05,
|
|
"loss": 0.238,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.7289801440452032,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 1.2814054227498798e-05,
|
|
"loss": 0.2173,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.7305718037483586,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.2798010588801542e-05,
|
|
"loss": 0.1342,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.7318451315108829,
|
|
"eval_loss": 0.1735742837190628,
|
|
"eval_runtime": 17.2302,
|
|
"eval_samples_per_second": 14.742,
|
|
"eval_steps_per_second": 14.742,
|
|
"step": 4598
|
|
},
|
|
{
|
|
"epoch": 0.7321634634515141,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 1.2781966950104285e-05,
|
|
"loss": 0.2025,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.7337551231546695,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 1.2765923311407029e-05,
|
|
"loss": 0.2009,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.735346782857825,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.2749879672709771e-05,
|
|
"loss": 0.1313,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.7369384425609805,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 1.2733836034012515e-05,
|
|
"loss": 0.1989,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.738530102264136,
|
|
"grad_norm": 5.15625,
|
|
"learning_rate": 1.2717792395315259e-05,
|
|
"loss": 0.1405,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.7401217619672914,
|
|
"grad_norm": 8.5,
|
|
"learning_rate": 1.2701748756618001e-05,
|
|
"loss": 0.2711,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.7417134216704468,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 1.2685705117920745e-05,
|
|
"loss": 0.2044,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.7433050813736023,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 1.266966147922349e-05,
|
|
"loss": 0.1699,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.7448967410767577,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 1.2653617840526233e-05,
|
|
"loss": 0.1815,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.7464884007799133,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.2637574201828974e-05,
|
|
"loss": 0.2117,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.7480800604830687,
|
|
"grad_norm": 4.71875,
|
|
"learning_rate": 1.2621530563131718e-05,
|
|
"loss": 0.1652,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.7496717201862242,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.2605486924434462e-05,
|
|
"loss": 0.1588,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.7512633798893796,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 1.2589443285737206e-05,
|
|
"loss": 0.19,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.7528550395925351,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.257339964703995e-05,
|
|
"loss": 0.1849,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.7544466992956905,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 1.2557356008342694e-05,
|
|
"loss": 0.18,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.7560383589988461,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.2541312369645438e-05,
|
|
"loss": 0.2441,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.7576300187020015,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 1.2525268730948179e-05,
|
|
"loss": 0.2301,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.759221678405157,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 1.2509225092250923e-05,
|
|
"loss": 0.2008,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.7608133381083124,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.2493181453553667e-05,
|
|
"loss": 0.2038,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.7624049978114679,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 1.2477137814856411e-05,
|
|
"loss": 0.2088,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.7639966575146234,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 1.2461094176159155e-05,
|
|
"loss": 0.1927,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.7651108193068322,
|
|
"eval_loss": 0.1735478788614273,
|
|
"eval_runtime": 17.5023,
|
|
"eval_samples_per_second": 14.512,
|
|
"eval_steps_per_second": 14.512,
|
|
"step": 4807
|
|
},
|
|
{
|
|
"epoch": 0.7655883172177789,
|
|
"grad_norm": 3.984375,
|
|
"learning_rate": 1.2445050537461897e-05,
|
|
"loss": 0.1859,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.7671799769209343,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 1.2429006898764641e-05,
|
|
"loss": 0.1558,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.7687716366240898,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 1.2412963260067384e-05,
|
|
"loss": 0.207,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.7703632963272452,
|
|
"grad_norm": 6.65625,
|
|
"learning_rate": 1.2396919621370128e-05,
|
|
"loss": 0.2338,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.7719549560304007,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 1.238087598267287e-05,
|
|
"loss": 0.1807,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.7735466157335562,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 1.2364832343975614e-05,
|
|
"loss": 0.199,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.7751382754367117,
|
|
"grad_norm": 6.5625,
|
|
"learning_rate": 1.2348788705278358e-05,
|
|
"loss": 0.2278,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.7767299351398671,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 1.2332745066581102e-05,
|
|
"loss": 0.1716,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.7783215948430225,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.2316701427883846e-05,
|
|
"loss": 0.1959,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.779913254546178,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 1.2300657789186587e-05,
|
|
"loss": 0.1985,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.7815049142493334,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 1.2284614150489331e-05,
|
|
"loss": 0.2016,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.783096573952489,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 1.2268570511792075e-05,
|
|
"loss": 0.228,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.7846882336556444,
|
|
"grad_norm": 4.78125,
|
|
"learning_rate": 1.2252526873094819e-05,
|
|
"loss": 0.1966,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.7862798933587999,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 1.2236483234397563e-05,
|
|
"loss": 0.2227,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.7878715530619553,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 1.2220439595700307e-05,
|
|
"loss": 0.1889,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.7894632127651108,
|
|
"grad_norm": 6.375,
|
|
"learning_rate": 1.220439595700305e-05,
|
|
"loss": 0.2274,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.7910548724682663,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 1.2188352318305792e-05,
|
|
"loss": 0.2096,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.7926465321714218,
|
|
"grad_norm": 5.15625,
|
|
"learning_rate": 1.2172308679608536e-05,
|
|
"loss": 0.1755,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.7942381918745772,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 1.215626504091128e-05,
|
|
"loss": 0.212,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.7958298515777327,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.2140221402214024e-05,
|
|
"loss": 0.151,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.7974215112808881,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 1.2124177763516766e-05,
|
|
"loss": 0.185,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.7983765071027814,
|
|
"eval_loss": 0.16947615146636963,
|
|
"eval_runtime": 17.2898,
|
|
"eval_samples_per_second": 14.691,
|
|
"eval_steps_per_second": 14.691,
|
|
"step": 5016
|
|
},
|
|
{
|
|
"epoch": 0.7990131709840436,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.210813412481951e-05,
|
|
"loss": 0.2137,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.8006048306871991,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 1.2092090486122254e-05,
|
|
"loss": 0.2036,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.8021964903903546,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 1.2076046847424997e-05,
|
|
"loss": 0.1905,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.80378815009351,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 1.2060003208727739e-05,
|
|
"loss": 0.1611,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.8053798097966655,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 1.2043959570030483e-05,
|
|
"loss": 0.1875,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.8069714694998209,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 1.2027915931333227e-05,
|
|
"loss": 0.1819,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.8085631292029763,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 1.2011872292635971e-05,
|
|
"loss": 0.146,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.8101547889061319,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 1.1995828653938715e-05,
|
|
"loss": 0.2144,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.8117464486092874,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 1.1979785015241459e-05,
|
|
"loss": 0.2513,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.8133381083124428,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 1.1963741376544203e-05,
|
|
"loss": 0.1355,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.8149297680155982,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 1.1947697737846944e-05,
|
|
"loss": 0.1852,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.8165214277187537,
|
|
"grad_norm": 5.625,
|
|
"learning_rate": 1.1931654099149688e-05,
|
|
"loss": 0.1313,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.8181130874219092,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 1.1915610460452432e-05,
|
|
"loss": 0.1434,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.8197047471250647,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 1.1899566821755176e-05,
|
|
"loss": 0.1637,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.8212964068282201,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.1883523183057918e-05,
|
|
"loss": 0.2158,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.8228880665313756,
|
|
"grad_norm": 6.0625,
|
|
"learning_rate": 1.1867479544360662e-05,
|
|
"loss": 0.1998,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.824479726234531,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.1851435905663406e-05,
|
|
"loss": 0.1704,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.8260713859376865,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 1.1835392266966149e-05,
|
|
"loss": 0.1698,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.827663045640842,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 1.1819348628268893e-05,
|
|
"loss": 0.2275,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.8292547053439975,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 1.1803304989571635e-05,
|
|
"loss": 0.1536,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.8308463650471529,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 1.1787261350874379e-05,
|
|
"loss": 0.1895,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.8316421948987307,
|
|
"eval_loss": 0.16963571310043335,
|
|
"eval_runtime": 17.226,
|
|
"eval_samples_per_second": 14.745,
|
|
"eval_steps_per_second": 14.745,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"epoch": 0.8324380247503084,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 1.1771217712177123e-05,
|
|
"loss": 0.1736,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.8340296844534638,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 1.1755174073479867e-05,
|
|
"loss": 0.1493,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.8356213441566193,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.1739130434782611e-05,
|
|
"loss": 0.1593,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.8372130038597748,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 1.1723086796085352e-05,
|
|
"loss": 0.1552,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.8388046635629303,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.1707043157388096e-05,
|
|
"loss": 0.1449,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.8403963232660857,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 1.169099951869084e-05,
|
|
"loss": 0.1442,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.8419879829692412,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 1.1674955879993584e-05,
|
|
"loss": 0.1582,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.8435796426723966,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.1658912241296328e-05,
|
|
"loss": 0.2484,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.8451713023755522,
|
|
"grad_norm": 4.34375,
|
|
"learning_rate": 1.1642868602599072e-05,
|
|
"loss": 0.1716,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.8467629620787076,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 1.1626824963901814e-05,
|
|
"loss": 0.2224,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.848354621781863,
|
|
"grad_norm": 4.96875,
|
|
"learning_rate": 1.1610781325204557e-05,
|
|
"loss": 0.1811,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.8499462814850185,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 1.15947376865073e-05,
|
|
"loss": 0.1465,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.8515379411881739,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 1.1578694047810045e-05,
|
|
"loss": 0.2368,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.8531296008913294,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 1.1562650409112789e-05,
|
|
"loss": 0.1873,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.8547212605944849,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 1.1546606770415531e-05,
|
|
"loss": 0.1708,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.8563129202976404,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 1.1530563131718275e-05,
|
|
"loss": 0.1947,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.8579045800007958,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 1.1514519493021019e-05,
|
|
"loss": 0.2159,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.8594962397039513,
|
|
"grad_norm": 4.9375,
|
|
"learning_rate": 1.1498475854323761e-05,
|
|
"loss": 0.1845,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.8610878994071067,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 1.1482432215626504e-05,
|
|
"loss": 0.183,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.8626795591102622,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 1.1466388576929248e-05,
|
|
"loss": 0.1899,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.8642712188134177,
|
|
"grad_norm": 7.59375,
|
|
"learning_rate": 1.1450344938231992e-05,
|
|
"loss": 0.16,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.8649078826946799,
|
|
"eval_loss": 0.1669895052909851,
|
|
"eval_runtime": 17.1229,
|
|
"eval_samples_per_second": 14.834,
|
|
"eval_steps_per_second": 14.834,
|
|
"step": 5434
|
|
},
|
|
{
|
|
"epoch": 0.8658628785165732,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 1.1434301299534736e-05,
|
|
"loss": 0.1487,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.8674545382197286,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 1.141825766083748e-05,
|
|
"loss": 0.1371,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.8690461979228841,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 1.1402214022140224e-05,
|
|
"loss": 0.161,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.8706378576260395,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 1.1386170383442965e-05,
|
|
"loss": 0.2296,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.8722295173291951,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 1.1370126744745709e-05,
|
|
"loss": 0.2084,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.8738211770323505,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 1.1354083106048453e-05,
|
|
"loss": 0.1496,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.875412836735506,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.1338039467351197e-05,
|
|
"loss": 0.1789,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.8770044964386614,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 1.132199582865394e-05,
|
|
"loss": 0.1899,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.8785961561418169,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 1.1305952189956683e-05,
|
|
"loss": 0.1897,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.8801878158449723,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.1289908551259427e-05,
|
|
"loss": 0.1808,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.8817794755481279,
|
|
"grad_norm": 4.90625,
|
|
"learning_rate": 1.127386491256217e-05,
|
|
"loss": 0.2111,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.8833711352512833,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 1.1257821273864913e-05,
|
|
"loss": 0.1603,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.8849627949544387,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.1241777635167657e-05,
|
|
"loss": 0.1687,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.8865544546575942,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.12257339964704e-05,
|
|
"loss": 0.1873,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.8881461143607496,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.1209690357773144e-05,
|
|
"loss": 0.2335,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.8897377740639051,
|
|
"grad_norm": 11.1875,
|
|
"learning_rate": 1.1193646719075888e-05,
|
|
"loss": 0.2463,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.8913294337670606,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.1177603080378632e-05,
|
|
"loss": 0.1406,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.8929210934702161,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 1.1161559441681373e-05,
|
|
"loss": 0.1538,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.8945127531733715,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 1.1145515802984117e-05,
|
|
"loss": 0.1828,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.896104412876527,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 1.112947216428686e-05,
|
|
"loss": 0.1098,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.8976960725796824,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 1.1113428525589605e-05,
|
|
"loss": 0.2177,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.8981735704906291,
|
|
"eval_loss": 0.16581162810325623,
|
|
"eval_runtime": 17.0424,
|
|
"eval_samples_per_second": 14.904,
|
|
"eval_steps_per_second": 14.904,
|
|
"step": 5643
|
|
},
|
|
{
|
|
"epoch": 0.899287732282838,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.1097384886892349e-05,
|
|
"loss": 0.2013,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.9008793919859934,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 1.1081341248195093e-05,
|
|
"loss": 0.2335,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.9024710516891489,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 1.1065297609497837e-05,
|
|
"loss": 0.178,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.9040627113923043,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 1.1049253970800577e-05,
|
|
"loss": 0.1254,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.9056543710954598,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.1033210332103321e-05,
|
|
"loss": 0.2078,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.9072460307986152,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 1.1017166693406065e-05,
|
|
"loss": 0.1577,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.9088376905017708,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 1.100112305470881e-05,
|
|
"loss": 0.1744,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.9104293502049262,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 1.0985079416011552e-05,
|
|
"loss": 0.2146,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.9120210099080817,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.0969035777314296e-05,
|
|
"loss": 0.1712,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.9136126696112371,
|
|
"grad_norm": 11.5625,
|
|
"learning_rate": 1.095299213861704e-05,
|
|
"loss": 0.2017,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.9152043293143926,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 1.0936948499919782e-05,
|
|
"loss": 0.1906,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.916795989017548,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 1.0920904861222526e-05,
|
|
"loss": 0.2205,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.9183876487207036,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 1.0904861222525269e-05,
|
|
"loss": 0.1499,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.919979308423859,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 1.0888817583828013e-05,
|
|
"loss": 0.1933,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.9215709681270144,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 1.0872773945130757e-05,
|
|
"loss": 0.1519,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.9231626278301699,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 1.08567303064335e-05,
|
|
"loss": 0.1531,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.9247542875333253,
|
|
"grad_norm": 4.34375,
|
|
"learning_rate": 1.0840686667736245e-05,
|
|
"loss": 0.135,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.9263459472364809,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 1.0824643029038985e-05,
|
|
"loss": 0.188,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.9279376069396363,
|
|
"grad_norm": 6.5625,
|
|
"learning_rate": 1.080859939034173e-05,
|
|
"loss": 0.2598,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.9295292666427918,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 1.0792555751644473e-05,
|
|
"loss": 0.164,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.9311209263459472,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.0776512112947217e-05,
|
|
"loss": 0.1643,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.9314392582865784,
|
|
"eval_loss": 0.16592954099178314,
|
|
"eval_runtime": 17.4273,
|
|
"eval_samples_per_second": 14.575,
|
|
"eval_steps_per_second": 14.575,
|
|
"step": 5852
|
|
},
|
|
{
|
|
"epoch": 0.9327125860491027,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 1.0760468474249961e-05,
|
|
"loss": 0.1734,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.9343042457522581,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 1.0744424835552705e-05,
|
|
"loss": 0.1852,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.9358959054554137,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.0728381196855448e-05,
|
|
"loss": 0.2208,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.9374875651585691,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.071233755815819e-05,
|
|
"loss": 0.1591,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.9390792248617246,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 1.0696293919460934e-05,
|
|
"loss": 0.1817,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.94067088456488,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 1.0680250280763678e-05,
|
|
"loss": 0.1772,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.9422625442680355,
|
|
"grad_norm": 4.5625,
|
|
"learning_rate": 1.0664206642066422e-05,
|
|
"loss": 0.211,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.9438542039711909,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.0648163003369165e-05,
|
|
"loss": 0.2176,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.9454458636743465,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 1.0632119364671909e-05,
|
|
"loss": 0.1513,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.9470375233775019,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 1.0616075725974653e-05,
|
|
"loss": 0.1716,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.9486291830806574,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.0600032087277395e-05,
|
|
"loss": 0.1497,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.9502208427838128,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.0583988448580137e-05,
|
|
"loss": 0.1577,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.9518125024869682,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 1.0567944809882881e-05,
|
|
"loss": 0.1749,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.9534041621901238,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 1.0551901171185625e-05,
|
|
"loss": 0.1801,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.9549958218932793,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.053585753248837e-05,
|
|
"loss": 0.2042,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.9565874815964347,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.0519813893791113e-05,
|
|
"loss": 0.1737,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.9581791412995901,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.0503770255093858e-05,
|
|
"loss": 0.1679,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.9597708010027456,
|
|
"grad_norm": 8.25,
|
|
"learning_rate": 1.0487726616396598e-05,
|
|
"loss": 0.1744,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.961362460705901,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 1.0471682977699342e-05,
|
|
"loss": 0.1483,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.9629541204090566,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.0455639339002086e-05,
|
|
"loss": 0.1863,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.964545780112212,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 1.043959570030483e-05,
|
|
"loss": 0.2259,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.9647049460825275,
|
|
"eval_loss": 0.16700074076652527,
|
|
"eval_runtime": 17.3505,
|
|
"eval_samples_per_second": 14.639,
|
|
"eval_steps_per_second": 14.639,
|
|
"step": 6061
|
|
},
|
|
{
|
|
"epoch": 0.9661374398153675,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 1.0423552061607574e-05,
|
|
"loss": 0.1841,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.9677290995185229,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.0407508422910317e-05,
|
|
"loss": 0.1803,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.9693207592216784,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 1.039146478421306e-05,
|
|
"loss": 0.1683,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.9709124189248338,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 1.0375421145515805e-05,
|
|
"loss": 0.2083,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.9725040786279894,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 1.0359377506818547e-05,
|
|
"loss": 0.1896,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.9740957383311448,
|
|
"grad_norm": 6.40625,
|
|
"learning_rate": 1.0343333868121291e-05,
|
|
"loss": 0.186,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.9756873980343003,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 1.0327290229424033e-05,
|
|
"loss": 0.1651,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.9772790577374557,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.0311246590726777e-05,
|
|
"loss": 0.1663,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.9788707174406112,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 1.0295202952029521e-05,
|
|
"loss": 0.1985,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.9804623771437667,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.0279159313332265e-05,
|
|
"loss": 0.1623,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.9820540368469222,
|
|
"grad_norm": 4.8125,
|
|
"learning_rate": 1.026311567463501e-05,
|
|
"loss": 0.1522,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.9836456965500776,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 1.024707203593775e-05,
|
|
"loss": 0.1565,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.985237356253233,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 1.0231028397240494e-05,
|
|
"loss": 0.1481,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.9868290159563885,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.0214984758543238e-05,
|
|
"loss": 0.1811,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.988420675659544,
|
|
"grad_norm": 5.0,
|
|
"learning_rate": 1.0198941119845982e-05,
|
|
"loss": 0.1759,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.9900123353626995,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 1.0182897481148726e-05,
|
|
"loss": 0.171,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.991603995065855,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 1.016685384245147e-05,
|
|
"loss": 0.2191,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.9931956547690104,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 1.0150810203754213e-05,
|
|
"loss": 0.1673,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.9947873144721658,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.0134766565056955e-05,
|
|
"loss": 0.1335,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.9963789741753213,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.0118722926359699e-05,
|
|
"loss": 0.1603,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.9979706338784767,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 1.0102679287662443e-05,
|
|
"loss": 0.1602,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.9979706338784767,
|
|
"eval_loss": 0.16035068035125732,
|
|
"eval_runtime": 17.2462,
|
|
"eval_samples_per_second": 14.728,
|
|
"eval_steps_per_second": 14.728,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.9995622935816323,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 1.0086635648965185e-05,
|
|
"loss": 0.1963,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 1.0011141617922088,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 1.007059201026793e-05,
|
|
"loss": 0.1518,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 1.0027058214953644,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 1.0054548371570673e-05,
|
|
"loss": 0.1133,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 1.0042974811985197,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.0038504732873418e-05,
|
|
"loss": 0.0914,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 1.0058891409016752,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 1.002246109417616e-05,
|
|
"loss": 0.1315,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 1.0074808006048306,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 1.0006417455478902e-05,
|
|
"loss": 0.1148,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 1.0090724603079861,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 9.990373816781646e-06,
|
|
"loss": 0.1373,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 1.0106641200111417,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 9.97433017808439e-06,
|
|
"loss": 0.1209,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 1.012255779714297,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 9.958286539387134e-06,
|
|
"loss": 0.1528,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 1.0138474394174526,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 9.942242900689877e-06,
|
|
"loss": 0.1487,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 1.015439099120608,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 9.92619926199262e-06,
|
|
"loss": 0.1027,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 1.0170307588237635,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 9.910155623295365e-06,
|
|
"loss": 0.1325,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 1.018622418526919,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 9.894111984598109e-06,
|
|
"loss": 0.0845,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 1.0202140782300744,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 9.878068345900851e-06,
|
|
"loss": 0.109,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 1.02180573793323,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 9.862024707203595e-06,
|
|
"loss": 0.144,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 1.0233973976363853,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 9.845981068506339e-06,
|
|
"loss": 0.099,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 1.0249890573395408,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 9.829937429809081e-06,
|
|
"loss": 0.1193,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 1.0265807170426964,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 9.813893791111825e-06,
|
|
"loss": 0.0981,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 1.0281723767458517,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 9.797850152414568e-06,
|
|
"loss": 0.1065,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 1.0297640364490073,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 9.781806513717312e-06,
|
|
"loss": 0.1236,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 1.031196530181847,
|
|
"eval_loss": 0.1670570820569992,
|
|
"eval_runtime": 17.3989,
|
|
"eval_samples_per_second": 14.599,
|
|
"eval_steps_per_second": 14.599,
|
|
"step": 6479
|
|
},
|
|
{
|
|
"epoch": 1.0313556961521626,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 9.765762875020054e-06,
|
|
"loss": 0.1109,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 1.0329473558553182,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 9.749719236322798e-06,
|
|
"loss": 0.1307,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 1.0345390155584735,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 9.733675597625542e-06,
|
|
"loss": 0.1659,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 1.036130675261629,
|
|
"grad_norm": 5.09375,
|
|
"learning_rate": 9.717631958928285e-06,
|
|
"loss": 0.1027,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 1.0377223349647846,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 9.701588320231029e-06,
|
|
"loss": 0.0739,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 1.03931399466794,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 9.685544681533773e-06,
|
|
"loss": 0.1098,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 1.0409056543710955,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 9.669501042836517e-06,
|
|
"loss": 0.1182,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 1.0424973140742508,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 9.653457404139259e-06,
|
|
"loss": 0.0948,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 1.0440889737774064,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 9.637413765442003e-06,
|
|
"loss": 0.1051,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 1.045680633480562,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 9.621370126744747e-06,
|
|
"loss": 0.1113,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 1.0472722931837173,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 9.60532648804749e-06,
|
|
"loss": 0.1016,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 1.0488639528868728,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 9.589282849350233e-06,
|
|
"loss": 0.073,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 1.0504556125900282,
|
|
"grad_norm": 7.0625,
|
|
"learning_rate": 9.573239210652978e-06,
|
|
"loss": 0.131,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 1.0520472722931837,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 9.557195571955722e-06,
|
|
"loss": 0.0949,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 1.0536389319963393,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 9.541151933258464e-06,
|
|
"loss": 0.1305,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 1.0552305916994946,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 9.525108294561208e-06,
|
|
"loss": 0.123,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 1.0568222514026502,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 9.50906465586395e-06,
|
|
"loss": 0.1029,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 1.0584139111058055,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 9.493021017166694e-06,
|
|
"loss": 0.1459,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 1.060005570808961,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 9.476977378469437e-06,
|
|
"loss": 0.0936,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 1.0615972305121164,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 9.46093373977218e-06,
|
|
"loss": 0.0975,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 1.063188890215272,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 9.444890101074925e-06,
|
|
"loss": 0.1044,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 1.0644622179777963,
|
|
"eval_loss": 0.16782476007938385,
|
|
"eval_runtime": 17.2113,
|
|
"eval_samples_per_second": 14.758,
|
|
"eval_steps_per_second": 14.758,
|
|
"step": 6688
|
|
},
|
|
{
|
|
"epoch": 1.0647805499184275,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 9.428846462377667e-06,
|
|
"loss": 0.1174,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 1.0663722096215829,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 9.412802823680411e-06,
|
|
"loss": 0.0994,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 1.0679638693247384,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 9.396759184983155e-06,
|
|
"loss": 0.1186,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 1.0695555290278937,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 9.380715546285897e-06,
|
|
"loss": 0.1066,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 1.0711471887310493,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 9.364671907588641e-06,
|
|
"loss": 0.1285,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 1.0727388484342049,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 9.348628268891385e-06,
|
      "loss": 0.1461,
      "step": 6740
    },
    {
      "epoch": 1.0743305081373602,
      "grad_norm": 2.640625,
      "learning_rate": 9.33258463019413e-06,
      "loss": 0.0994,
      "step": 6750
    },
    {
      "epoch": 1.0759221678405158,
      "grad_norm": 3.96875,
      "learning_rate": 9.316540991496872e-06,
      "loss": 0.1033,
      "step": 6760
    },
    {
      "epoch": 1.077513827543671,
      "grad_norm": 3.0,
      "learning_rate": 9.300497352799616e-06,
      "loss": 0.1152,
      "step": 6770
    },
    {
      "epoch": 1.0791054872468266,
      "grad_norm": 3.40625,
      "learning_rate": 9.28445371410236e-06,
      "loss": 0.1019,
      "step": 6780
    },
    {
      "epoch": 1.0806971469499822,
      "grad_norm": 2.546875,
      "learning_rate": 9.268410075405104e-06,
      "loss": 0.1095,
      "step": 6790
    },
    {
      "epoch": 1.0822888066531375,
      "grad_norm": 2.765625,
      "learning_rate": 9.252366436707846e-06,
      "loss": 0.1302,
      "step": 6800
    },
    {
      "epoch": 1.083880466356293,
      "grad_norm": 3.171875,
      "learning_rate": 9.23632279801059e-06,
      "loss": 0.1201,
      "step": 6810
    },
    {
      "epoch": 1.0854721260594484,
      "grad_norm": 3.453125,
      "learning_rate": 9.220279159313333e-06,
      "loss": 0.0771,
      "step": 6820
    },
    {
      "epoch": 1.087063785762604,
      "grad_norm": 3.5,
      "learning_rate": 9.204235520616077e-06,
      "loss": 0.0743,
      "step": 6830
    },
    {
      "epoch": 1.0886554454657593,
      "grad_norm": 2.53125,
      "learning_rate": 9.188191881918819e-06,
      "loss": 0.1023,
      "step": 6840
    },
    {
      "epoch": 1.0902471051689149,
      "grad_norm": 3.03125,
      "learning_rate": 9.172148243221563e-06,
      "loss": 0.1105,
      "step": 6850
    },
    {
      "epoch": 1.0918387648720704,
      "grad_norm": 3.21875,
      "learning_rate": 9.156104604524307e-06,
      "loss": 0.1228,
      "step": 6860
    },
    {
      "epoch": 1.0934304245752258,
      "grad_norm": 2.40625,
      "learning_rate": 9.14006096582705e-06,
      "loss": 0.1516,
      "step": 6870
    },
    {
      "epoch": 1.0950220842783813,
      "grad_norm": 4.125,
      "learning_rate": 9.124017327129793e-06,
      "loss": 0.1466,
      "step": 6880
    },
    {
      "epoch": 1.0966137439815367,
      "grad_norm": 2.609375,
      "learning_rate": 9.107973688432538e-06,
      "loss": 0.1151,
      "step": 6890
    },
    {
      "epoch": 1.0977279057737457,
      "eval_loss": 0.16758352518081665,
      "eval_runtime": 17.5957,
      "eval_samples_per_second": 14.435,
      "eval_steps_per_second": 14.435,
      "step": 6897
    },
    {
      "epoch": 1.0982054036846922,
      "grad_norm": 1.2265625,
      "learning_rate": 9.09193004973528e-06,
      "loss": 0.1258,
      "step": 6900
    },
    {
      "epoch": 1.0997970633878478,
      "grad_norm": 3.34375,
      "learning_rate": 9.075886411038024e-06,
      "loss": 0.1097,
      "step": 6910
    },
    {
      "epoch": 1.101388723091003,
      "grad_norm": 1.53125,
      "learning_rate": 9.059842772340768e-06,
      "loss": 0.1177,
      "step": 6920
    },
    {
      "epoch": 1.1029803827941587,
      "grad_norm": 2.015625,
      "learning_rate": 9.043799133643512e-06,
      "loss": 0.0956,
      "step": 6930
    },
    {
      "epoch": 1.104572042497314,
      "grad_norm": 2.78125,
      "learning_rate": 9.027755494946254e-06,
      "loss": 0.1059,
      "step": 6940
    },
    {
      "epoch": 1.1061637022004696,
      "grad_norm": 3.90625,
      "learning_rate": 9.011711856248998e-06,
      "loss": 0.1147,
      "step": 6950
    },
    {
      "epoch": 1.107755361903625,
      "grad_norm": 4.375,
      "learning_rate": 8.995668217551742e-06,
      "loss": 0.1425,
      "step": 6960
    },
    {
      "epoch": 1.1093470216067804,
      "grad_norm": 4.1875,
      "learning_rate": 8.979624578854485e-06,
      "loss": 0.1119,
      "step": 6970
    },
    {
      "epoch": 1.110938681309936,
      "grad_norm": 2.671875,
      "learning_rate": 8.963580940157229e-06,
      "loss": 0.1251,
      "step": 6980
    },
    {
      "epoch": 1.1125303410130913,
      "grad_norm": 2.296875,
      "learning_rate": 8.947537301459973e-06,
      "loss": 0.1107,
      "step": 6990
    },
    {
      "epoch": 1.114122000716247,
      "grad_norm": 4.25,
      "learning_rate": 8.931493662762715e-06,
      "loss": 0.1059,
      "step": 7000
    },
    {
      "epoch": 1.1157136604194022,
      "grad_norm": 2.375,
      "learning_rate": 8.915450024065459e-06,
      "loss": 0.1601,
      "step": 7010
    },
    {
      "epoch": 1.1173053201225578,
      "grad_norm": 2.703125,
      "learning_rate": 8.899406385368201e-06,
      "loss": 0.1057,
      "step": 7020
    },
    {
      "epoch": 1.1188969798257133,
      "grad_norm": 2.03125,
      "learning_rate": 8.883362746670945e-06,
      "loss": 0.113,
      "step": 7030
    },
    {
      "epoch": 1.1204886395288687,
      "grad_norm": 3.78125,
      "learning_rate": 8.867319107973688e-06,
      "loss": 0.1284,
      "step": 7040
    },
    {
      "epoch": 1.1220802992320242,
      "grad_norm": 3.71875,
      "learning_rate": 8.851275469276432e-06,
      "loss": 0.1107,
      "step": 7050
    },
    {
      "epoch": 1.1236719589351796,
      "grad_norm": 2.390625,
      "learning_rate": 8.835231830579176e-06,
      "loss": 0.0859,
      "step": 7060
    },
    {
      "epoch": 1.1252636186383351,
      "grad_norm": 3.171875,
      "learning_rate": 8.81918819188192e-06,
      "loss": 0.1267,
      "step": 7070
    },
    {
      "epoch": 1.1268552783414907,
      "grad_norm": 2.296875,
      "learning_rate": 8.803144553184662e-06,
      "loss": 0.1235,
      "step": 7080
    },
    {
      "epoch": 1.128446938044646,
      "grad_norm": 5.46875,
      "learning_rate": 8.787100914487406e-06,
      "loss": 0.138,
      "step": 7090
    },
    {
      "epoch": 1.1300385977478016,
      "grad_norm": 4.53125,
      "learning_rate": 8.77105727579015e-06,
      "loss": 0.117,
      "step": 7100
    },
    {
      "epoch": 1.1309935935696949,
      "eval_loss": 0.1663266271352768,
      "eval_runtime": 17.4815,
      "eval_samples_per_second": 14.53,
      "eval_steps_per_second": 14.53,
      "step": 7106
    },
    {
      "epoch": 1.131630257450957,
      "grad_norm": 3.703125,
      "learning_rate": 8.755013637092893e-06,
      "loss": 0.0908,
      "step": 7110
    },
    {
      "epoch": 1.1332219171541125,
      "grad_norm": 2.515625,
      "learning_rate": 8.738969998395637e-06,
      "loss": 0.1413,
      "step": 7120
    },
    {
      "epoch": 1.134813576857268,
      "grad_norm": 3.671875,
      "learning_rate": 8.72292635969838e-06,
      "loss": 0.1131,
      "step": 7130
    },
    {
      "epoch": 1.1364052365604234,
      "grad_norm": 2.140625,
      "learning_rate": 8.706882721001125e-06,
      "loss": 0.1304,
      "step": 7140
    },
    {
      "epoch": 1.137996896263579,
      "grad_norm": 3.640625,
      "learning_rate": 8.690839082303867e-06,
      "loss": 0.0982,
      "step": 7150
    },
    {
      "epoch": 1.1395885559667343,
      "grad_norm": 3.640625,
      "learning_rate": 8.674795443606611e-06,
      "loss": 0.1061,
      "step": 7160
    },
    {
      "epoch": 1.1411802156698898,
      "grad_norm": 2.703125,
      "learning_rate": 8.658751804909355e-06,
      "loss": 0.0898,
      "step": 7170
    },
    {
      "epoch": 1.1427718753730454,
      "grad_norm": 3.140625,
      "learning_rate": 8.642708166212098e-06,
      "loss": 0.1521,
      "step": 7180
    },
    {
      "epoch": 1.1443635350762007,
      "grad_norm": 4.40625,
      "learning_rate": 8.626664527514842e-06,
      "loss": 0.107,
      "step": 7190
    },
    {
      "epoch": 1.1459551947793563,
      "grad_norm": 3.8125,
      "learning_rate": 8.610620888817584e-06,
      "loss": 0.1045,
      "step": 7200
    },
    {
      "epoch": 1.1475468544825116,
      "grad_norm": 3.359375,
      "learning_rate": 8.594577250120328e-06,
      "loss": 0.1124,
      "step": 7210
    },
    {
      "epoch": 1.1491385141856671,
      "grad_norm": 4.5625,
      "learning_rate": 8.57853361142307e-06,
      "loss": 0.1278,
      "step": 7220
    },
    {
      "epoch": 1.1507301738888225,
      "grad_norm": 4.90625,
      "learning_rate": 8.562489972725814e-06,
      "loss": 0.1309,
      "step": 7230
    },
    {
      "epoch": 1.152321833591978,
      "grad_norm": 2.796875,
      "learning_rate": 8.546446334028558e-06,
      "loss": 0.1041,
      "step": 7240
    },
    {
      "epoch": 1.1539134932951336,
      "grad_norm": 2.265625,
      "learning_rate": 8.530402695331302e-06,
      "loss": 0.1043,
      "step": 7250
    },
    {
      "epoch": 1.155505152998289,
      "grad_norm": 5.65625,
      "learning_rate": 8.514359056634045e-06,
      "loss": 0.1832,
      "step": 7260
    },
    {
      "epoch": 1.1570968127014445,
      "grad_norm": 4.0,
      "learning_rate": 8.498315417936789e-06,
      "loss": 0.1087,
      "step": 7270
    },
    {
      "epoch": 1.1586884724045998,
      "grad_norm": 5.1875,
      "learning_rate": 8.482271779239533e-06,
      "loss": 0.1106,
      "step": 7280
    },
    {
      "epoch": 1.1602801321077554,
      "grad_norm": 3.109375,
      "learning_rate": 8.466228140542275e-06,
      "loss": 0.1033,
      "step": 7290
    },
    {
      "epoch": 1.1618717918109107,
      "grad_norm": 3.28125,
      "learning_rate": 8.450184501845019e-06,
      "loss": 0.1071,
      "step": 7300
    },
    {
      "epoch": 1.1634634515140663,
      "grad_norm": 2.625,
      "learning_rate": 8.434140863147763e-06,
      "loss": 0.1177,
      "step": 7310
    },
    {
      "epoch": 1.164259281365644,
      "eval_loss": 0.16596829891204834,
      "eval_runtime": 20.0453,
      "eval_samples_per_second": 12.671,
      "eval_steps_per_second": 12.671,
      "step": 7315
    },
    {
      "epoch": 1.1650551112172218,
      "grad_norm": 3.375,
      "learning_rate": 8.418097224450507e-06,
      "loss": 0.1346,
      "step": 7320
    },
    {
      "epoch": 1.1666467709203772,
      "grad_norm": 4.25,
      "learning_rate": 8.40205358575325e-06,
      "loss": 0.104,
      "step": 7330
    },
    {
      "epoch": 1.1682384306235327,
      "grad_norm": 4.875,
      "learning_rate": 8.386009947055994e-06,
      "loss": 0.1555,
      "step": 7340
    },
    {
      "epoch": 1.169830090326688,
      "grad_norm": 2.890625,
      "learning_rate": 8.369966308358738e-06,
      "loss": 0.1115,
      "step": 7350
    },
    {
      "epoch": 1.1714217500298436,
      "grad_norm": 4.03125,
      "learning_rate": 8.35392266966148e-06,
      "loss": 0.157,
      "step": 7360
    },
    {
      "epoch": 1.1730134097329992,
      "grad_norm": 5.78125,
      "learning_rate": 8.337879030964224e-06,
      "loss": 0.1135,
      "step": 7370
    },
    {
      "epoch": 1.1746050694361545,
      "grad_norm": 4.53125,
      "learning_rate": 8.321835392266966e-06,
      "loss": 0.1326,
      "step": 7380
    },
    {
      "epoch": 1.17619672913931,
      "grad_norm": 4.0625,
      "learning_rate": 8.30579175356971e-06,
      "loss": 0.1092,
      "step": 7390
    },
    {
      "epoch": 1.1777883888424654,
      "grad_norm": 3.1875,
      "learning_rate": 8.289748114872453e-06,
      "loss": 0.1898,
      "step": 7400
    },
    {
      "epoch": 1.179380048545621,
      "grad_norm": 3.359375,
      "learning_rate": 8.273704476175197e-06,
      "loss": 0.117,
      "step": 7410
    },
    {
      "epoch": 1.1809717082487765,
      "grad_norm": 4.21875,
      "learning_rate": 8.25766083747794e-06,
      "loss": 0.1007,
      "step": 7420
    },
    {
      "epoch": 1.1825633679519318,
      "grad_norm": 3.421875,
      "learning_rate": 8.241617198780683e-06,
      "loss": 0.1277,
      "step": 7430
    },
    {
      "epoch": 1.1841550276550874,
      "grad_norm": 2.890625,
      "learning_rate": 8.225573560083427e-06,
      "loss": 0.1078,
      "step": 7440
    },
    {
      "epoch": 1.1857466873582427,
      "grad_norm": 3.84375,
      "learning_rate": 8.209529921386171e-06,
      "loss": 0.1074,
      "step": 7450
    },
    {
      "epoch": 1.1873383470613983,
      "grad_norm": 4.09375,
      "learning_rate": 8.193486282688915e-06,
      "loss": 0.1125,
      "step": 7460
    },
    {
      "epoch": 1.1889300067645538,
      "grad_norm": 2.78125,
      "learning_rate": 8.177442643991658e-06,
      "loss": 0.1357,
      "step": 7470
    },
    {
      "epoch": 1.1905216664677092,
      "grad_norm": 3.390625,
      "learning_rate": 8.161399005294402e-06,
      "loss": 0.0859,
      "step": 7480
    },
    {
      "epoch": 1.1921133261708647,
      "grad_norm": 3.6875,
      "learning_rate": 8.145355366597146e-06,
      "loss": 0.0982,
      "step": 7490
    },
    {
      "epoch": 1.19370498587402,
      "grad_norm": 1.7734375,
      "learning_rate": 8.129311727899888e-06,
      "loss": 0.0773,
      "step": 7500
    },
    {
      "epoch": 1.1952966455771756,
      "grad_norm": 2.15625,
      "learning_rate": 8.113268089202632e-06,
      "loss": 0.1433,
      "step": 7510
    },
    {
      "epoch": 1.1968883052803312,
      "grad_norm": 4.9375,
      "learning_rate": 8.097224450505376e-06,
      "loss": 0.121,
      "step": 7520
    },
    {
      "epoch": 1.1975249691615932,
      "eval_loss": 0.16437803208827972,
      "eval_runtime": 17.3473,
      "eval_samples_per_second": 14.642,
      "eval_steps_per_second": 14.642,
      "step": 7524
    },
    {
      "epoch": 1.1984799649834865,
      "grad_norm": 4.34375,
      "learning_rate": 8.08118081180812e-06,
      "loss": 0.1105,
      "step": 7530
    },
    {
      "epoch": 1.200071624686642,
      "grad_norm": 6.34375,
      "learning_rate": 8.065137173110862e-06,
      "loss": 0.1264,
      "step": 7540
    },
    {
      "epoch": 1.2016632843897974,
      "grad_norm": 4.09375,
      "learning_rate": 8.049093534413606e-06,
      "loss": 0.1108,
      "step": 7550
    },
    {
      "epoch": 1.203254944092953,
      "grad_norm": 4.21875,
      "learning_rate": 8.033049895716349e-06,
      "loss": 0.098,
      "step": 7560
    },
    {
      "epoch": 1.2048466037961083,
      "grad_norm": 2.96875,
      "learning_rate": 8.017006257019093e-06,
      "loss": 0.123,
      "step": 7570
    },
    {
      "epoch": 1.2064382634992639,
      "grad_norm": 4.03125,
      "learning_rate": 8.000962618321835e-06,
      "loss": 0.1421,
      "step": 7580
    },
    {
      "epoch": 1.2080299232024192,
      "grad_norm": 3.921875,
      "learning_rate": 7.984918979624579e-06,
      "loss": 0.1058,
      "step": 7590
    },
    {
      "epoch": 1.2096215829055748,
      "grad_norm": 2.640625,
      "learning_rate": 7.968875340927323e-06,
      "loss": 0.1396,
      "step": 7600
    },
    {
      "epoch": 1.2112132426087303,
      "grad_norm": 4.59375,
      "learning_rate": 7.952831702230066e-06,
      "loss": 0.0962,
      "step": 7610
    },
    {
      "epoch": 1.2128049023118856,
      "grad_norm": 4.0625,
      "learning_rate": 7.93678806353281e-06,
      "loss": 0.1195,
      "step": 7620
    },
    {
      "epoch": 1.2143965620150412,
      "grad_norm": 3.265625,
      "learning_rate": 7.920744424835554e-06,
      "loss": 0.1169,
      "step": 7630
    },
    {
      "epoch": 1.2159882217181965,
      "grad_norm": 2.203125,
      "learning_rate": 7.904700786138296e-06,
      "loss": 0.0962,
      "step": 7640
    },
    {
      "epoch": 1.217579881421352,
      "grad_norm": 2.875,
      "learning_rate": 7.88865714744104e-06,
      "loss": 0.1326,
      "step": 7650
    },
    {
      "epoch": 1.2191715411245077,
      "grad_norm": 3.109375,
      "learning_rate": 7.872613508743784e-06,
      "loss": 0.0989,
      "step": 7660
    },
    {
      "epoch": 1.220763200827663,
      "grad_norm": 2.4375,
      "learning_rate": 7.856569870046528e-06,
      "loss": 0.1229,
      "step": 7670
    },
    {
      "epoch": 1.2223548605308185,
      "grad_norm": 3.875,
      "learning_rate": 7.84052623134927e-06,
      "loss": 0.1187,
      "step": 7680
    },
    {
      "epoch": 1.2239465202339739,
      "grad_norm": 3.21875,
      "learning_rate": 7.824482592652014e-06,
      "loss": 0.0808,
      "step": 7690
    },
    {
      "epoch": 1.2255381799371294,
      "grad_norm": 4.5625,
      "learning_rate": 7.808438953954758e-06,
      "loss": 0.1194,
      "step": 7700
    },
    {
      "epoch": 1.227129839640285,
      "grad_norm": 3.25,
      "learning_rate": 7.792395315257502e-06,
      "loss": 0.0859,
      "step": 7710
    },
    {
      "epoch": 1.2287214993434403,
      "grad_norm": 3.34375,
      "learning_rate": 7.776351676560245e-06,
      "loss": 0.1038,
      "step": 7720
    },
    {
      "epoch": 1.2303131590465959,
      "grad_norm": 5.6875,
      "learning_rate": 7.760308037862989e-06,
      "loss": 0.1059,
      "step": 7730
    },
    {
      "epoch": 1.2307906569575424,
      "eval_loss": 0.16424033045768738,
      "eval_runtime": 17.3173,
      "eval_samples_per_second": 14.667,
      "eval_steps_per_second": 14.667,
      "step": 7733
    },
    {
      "epoch": 1.2319048187497512,
      "grad_norm": 5.15625,
      "learning_rate": 7.744264399165731e-06,
      "loss": 0.099,
      "step": 7740
    },
    {
      "epoch": 1.2334964784529068,
      "grad_norm": 4.28125,
      "learning_rate": 7.728220760468475e-06,
      "loss": 0.0997,
      "step": 7750
    },
    {
      "epoch": 1.2350881381560623,
      "grad_norm": 2.140625,
      "learning_rate": 7.712177121771218e-06,
      "loss": 0.108,
      "step": 7760
    },
    {
      "epoch": 1.2366797978592177,
      "grad_norm": 2.09375,
      "learning_rate": 7.696133483073962e-06,
      "loss": 0.1396,
      "step": 7770
    },
    {
      "epoch": 1.2382714575623732,
      "grad_norm": 7.46875,
      "learning_rate": 7.680089844376706e-06,
      "loss": 0.129,
      "step": 7780
    },
    {
      "epoch": 1.2398631172655286,
      "grad_norm": 1.3203125,
      "learning_rate": 7.664046205679448e-06,
      "loss": 0.0896,
      "step": 7790
    },
    {
      "epoch": 1.2414547769686841,
      "grad_norm": 1.9765625,
      "learning_rate": 7.648002566982192e-06,
      "loss": 0.1546,
      "step": 7800
    },
    {
      "epoch": 1.2430464366718397,
      "grad_norm": 2.890625,
      "learning_rate": 7.631958928284936e-06,
      "loss": 0.1331,
      "step": 7810
    },
    {
      "epoch": 1.244638096374995,
      "grad_norm": 2.90625,
      "learning_rate": 7.615915289587678e-06,
      "loss": 0.098,
      "step": 7820
    },
    {
      "epoch": 1.2462297560781506,
      "grad_norm": 4.53125,
      "learning_rate": 7.599871650890422e-06,
      "loss": 0.1382,
      "step": 7830
    },
    {
      "epoch": 1.247821415781306,
      "grad_norm": 2.40625,
      "learning_rate": 7.583828012193166e-06,
      "loss": 0.1146,
      "step": 7840
    },
    {
      "epoch": 1.2494130754844615,
      "grad_norm": 3.609375,
      "learning_rate": 7.5677843734959096e-06,
      "loss": 0.1688,
      "step": 7850
    },
    {
      "epoch": 1.251004735187617,
      "grad_norm": 2.125,
      "learning_rate": 7.551740734798653e-06,
      "loss": 0.1049,
      "step": 7860
    },
    {
      "epoch": 1.2525963948907723,
      "grad_norm": 1.4375,
      "learning_rate": 7.535697096101397e-06,
      "loss": 0.1263,
      "step": 7870
    },
    {
      "epoch": 1.2541880545939277,
      "grad_norm": 3.578125,
      "learning_rate": 7.51965345740414e-06,
      "loss": 0.1077,
      "step": 7880
    },
    {
      "epoch": 1.2557797142970832,
      "grad_norm": 4.1875,
      "learning_rate": 7.503609818706883e-06,
      "loss": 0.101,
      "step": 7890
    },
    {
      "epoch": 1.2573713740002388,
      "grad_norm": 3.265625,
      "learning_rate": 7.487566180009626e-06,
      "loss": 0.1352,
      "step": 7900
    },
    {
      "epoch": 1.2589630337033944,
      "grad_norm": 3.421875,
      "learning_rate": 7.47152254131237e-06,
      "loss": 0.0964,
      "step": 7910
    },
    {
      "epoch": 1.2605546934065497,
      "grad_norm": 3.0,
      "learning_rate": 7.455478902615114e-06,
      "loss": 0.1007,
      "step": 7920
    },
    {
      "epoch": 1.262146353109705,
      "grad_norm": 3.171875,
      "learning_rate": 7.439435263917857e-06,
      "loss": 0.1479,
      "step": 7930
    },
    {
      "epoch": 1.2637380128128606,
      "grad_norm": 2.734375,
      "learning_rate": 7.423391625220601e-06,
      "loss": 0.0849,
      "step": 7940
    },
    {
      "epoch": 1.2640563447534916,
      "eval_loss": 0.16628886759281158,
      "eval_runtime": 20.3803,
      "eval_samples_per_second": 12.463,
      "eval_steps_per_second": 12.463,
      "step": 7942
    },
    {
      "epoch": 1.2653296725160161,
      "grad_norm": 2.28125,
      "learning_rate": 7.407347986523345e-06,
      "loss": 0.0775,
      "step": 7950
    },
    {
      "epoch": 1.2669213322191715,
      "grad_norm": 2.5625,
      "learning_rate": 7.391304347826087e-06,
      "loss": 0.1022,
      "step": 7960
    },
    {
      "epoch": 1.268512991922327,
      "grad_norm": 2.25,
      "learning_rate": 7.375260709128831e-06,
      "loss": 0.0823,
      "step": 7970
    },
    {
      "epoch": 1.2701046516254824,
      "grad_norm": 3.59375,
      "learning_rate": 7.359217070431574e-06,
      "loss": 0.1509,
      "step": 7980
    },
    {
      "epoch": 1.271696311328638,
      "grad_norm": 2.40625,
      "learning_rate": 7.343173431734318e-06,
      "loss": 0.1006,
      "step": 7990
    },
    {
      "epoch": 1.2732879710317935,
      "grad_norm": 1.75,
      "learning_rate": 7.327129793037061e-06,
      "loss": 0.1094,
      "step": 8000
    },
    {
      "epoch": 1.2748796307349488,
      "grad_norm": 3.453125,
      "learning_rate": 7.311086154339805e-06,
      "loss": 0.1252,
      "step": 8010
    },
    {
      "epoch": 1.2764712904381044,
      "grad_norm": 2.25,
      "learning_rate": 7.295042515642549e-06,
      "loss": 0.1436,
      "step": 8020
    },
    {
      "epoch": 1.2780629501412597,
      "grad_norm": 1.7890625,
      "learning_rate": 7.278998876945291e-06,
      "loss": 0.1174,
      "step": 8030
    },
    {
      "epoch": 1.2796546098444153,
      "grad_norm": 2.25,
      "learning_rate": 7.262955238248035e-06,
      "loss": 0.1091,
      "step": 8040
    },
    {
      "epoch": 1.2812462695475708,
      "grad_norm": 2.34375,
      "learning_rate": 7.246911599550779e-06,
      "loss": 0.139,
      "step": 8050
    },
    {
      "epoch": 1.2828379292507261,
      "grad_norm": 2.84375,
      "learning_rate": 7.230867960853522e-06,
      "loss": 0.1083,
      "step": 8060
    },
    {
      "epoch": 1.2844295889538817,
      "grad_norm": 4.90625,
      "learning_rate": 7.214824322156266e-06,
      "loss": 0.1272,
      "step": 8070
    },
    {
      "epoch": 1.286021248657037,
      "grad_norm": 5.0625,
      "learning_rate": 7.198780683459009e-06,
      "loss": 0.1195,
      "step": 8080
    },
    {
      "epoch": 1.2876129083601926,
      "grad_norm": 3.96875,
      "learning_rate": 7.182737044761753e-06,
      "loss": 0.1134,
      "step": 8090
    },
    {
      "epoch": 1.2892045680633482,
      "grad_norm": 7.1875,
      "learning_rate": 7.166693406064495e-06,
      "loss": 0.1425,
      "step": 8100
    },
    {
      "epoch": 1.2907962277665035,
      "grad_norm": 3.296875,
      "learning_rate": 7.150649767367239e-06,
      "loss": 0.1234,
      "step": 8110
    },
    {
      "epoch": 1.292387887469659,
      "grad_norm": 2.890625,
      "learning_rate": 7.134606128669983e-06,
      "loss": 0.1269,
      "step": 8120
    },
    {
      "epoch": 1.2939795471728144,
      "grad_norm": 2.984375,
      "learning_rate": 7.118562489972726e-06,
      "loss": 0.1339,
      "step": 8130
    },
    {
      "epoch": 1.29557120687597,
      "grad_norm": 2.109375,
      "learning_rate": 7.1025188512754696e-06,
      "loss": 0.0994,
      "step": 8140
    },
    {
      "epoch": 1.2971628665791255,
      "grad_norm": 3.71875,
      "learning_rate": 7.086475212578214e-06,
      "loss": 0.1115,
      "step": 8150
    },
    {
      "epoch": 1.297322032549441,
      "eval_loss": 0.16382403671741486,
      "eval_runtime": 17.3715,
      "eval_samples_per_second": 14.622,
      "eval_steps_per_second": 14.622,
      "step": 8151
    },
    {
      "epoch": 1.2987545262822808,
      "grad_norm": 2.703125,
      "learning_rate": 7.070431573880957e-06,
      "loss": 0.1205,
      "step": 8160
    },
    {
      "epoch": 1.3003461859854364,
      "grad_norm": 2.78125,
      "learning_rate": 7.0543879351837e-06,
      "loss": 0.1583,
      "step": 8170
    },
    {
      "epoch": 1.3019378456885917,
      "grad_norm": 2.15625,
      "learning_rate": 7.038344296486443e-06,
      "loss": 0.1066,
      "step": 8180
    },
    {
      "epoch": 1.3035295053917473,
      "grad_norm": 2.96875,
      "learning_rate": 7.022300657789187e-06,
      "loss": 0.1401,
      "step": 8190
    },
    {
      "epoch": 1.3051211650949028,
      "grad_norm": 3.015625,
      "learning_rate": 7.006257019091931e-06,
      "loss": 0.1082,
      "step": 8200
    },
    {
      "epoch": 1.3067128247980582,
      "grad_norm": 3.828125,
      "learning_rate": 6.9902133803946736e-06,
      "loss": 0.0951,
      "step": 8210
    },
    {
      "epoch": 1.3083044845012135,
      "grad_norm": 2.046875,
      "learning_rate": 6.974169741697418e-06,
      "loss": 0.0987,
      "step": 8220
    },
    {
      "epoch": 1.309896144204369,
      "grad_norm": 1.140625,
      "learning_rate": 6.958126103000162e-06,
      "loss": 0.123,
      "step": 8230
    },
    {
      "epoch": 1.3114878039075246,
      "grad_norm": 4.4375,
      "learning_rate": 6.942082464302905e-06,
      "loss": 0.0767,
      "step": 8240
    },
    {
      "epoch": 1.3130794636106802,
      "grad_norm": 3.3125,
      "learning_rate": 6.926038825605648e-06,
      "loss": 0.095,
      "step": 8250
    },
    {
      "epoch": 1.3146711233138355,
      "grad_norm": 3.546875,
      "learning_rate": 6.909995186908391e-06,
      "loss": 0.0944,
      "step": 8260
    },
    {
      "epoch": 1.3162627830169908,
      "grad_norm": 4.9375,
      "learning_rate": 6.893951548211135e-06,
      "loss": 0.0572,
      "step": 8270
    },
    {
      "epoch": 1.3178544427201464,
      "grad_norm": 4.375,
      "learning_rate": 6.8779079095138776e-06,
      "loss": 0.1349,
      "step": 8280
    },
    {
      "epoch": 1.319446102423302,
      "grad_norm": 2.703125,
      "learning_rate": 6.861864270816622e-06,
      "loss": 0.0781,
      "step": 8290
    },
    {
      "epoch": 1.3210377621264573,
      "grad_norm": 2.921875,
      "learning_rate": 6.845820632119366e-06,
      "loss": 0.0999,
      "step": 8300
    },
    {
      "epoch": 1.3226294218296129,
      "grad_norm": 2.140625,
      "learning_rate": 6.829776993422109e-06,
      "loss": 0.1143,
      "step": 8310
    },
    {
      "epoch": 1.3242210815327682,
      "grad_norm": 3.171875,
      "learning_rate": 6.813733354724852e-06,
      "loss": 0.1557,
      "step": 8320
    },
    {
      "epoch": 1.3258127412359237,
      "grad_norm": 4.15625,
      "learning_rate": 6.797689716027596e-06,
      "loss": 0.1276,
      "step": 8330
    },
    {
      "epoch": 1.3274044009390793,
      "grad_norm": 4.53125,
      "learning_rate": 6.781646077330339e-06,
      "loss": 0.1028,
      "step": 8340
    },
    {
      "epoch": 1.3289960606422346,
      "grad_norm": 4.625,
      "learning_rate": 6.765602438633082e-06,
      "loss": 0.1192,
      "step": 8350
    },
    {
      "epoch": 1.3305877203453902,
      "grad_norm": 3.171875,
      "learning_rate": 6.749558799935826e-06,
      "loss": 0.1142,
      "step": 8360
    },
    {
      "epoch": 1.3305877203453902,
      "eval_loss": 0.16286160051822662,
      "eval_runtime": 17.3402,
      "eval_samples_per_second": 14.648,
      "eval_steps_per_second": 14.648,
      "step": 8360
    },
    {
      "epoch": 1.3321793800485455,
      "grad_norm": 4.53125,
      "learning_rate": 6.73351516123857e-06,
      "loss": 0.1373,
      "step": 8370
    },
    {
      "epoch": 1.333771039751701,
      "grad_norm": 4.3125,
      "learning_rate": 6.717471522541314e-06,
      "loss": 0.1369,
      "step": 8380
    },
    {
      "epoch": 1.3353626994548566,
      "grad_norm": 3.375,
      "learning_rate": 6.701427883844056e-06,
      "loss": 0.119,
      "step": 8390
    },
    {
      "epoch": 1.336954359158012,
      "grad_norm": 2.96875,
      "learning_rate": 6.6853842451468e-06,
      "loss": 0.1429,
      "step": 8400
    },
    {
      "epoch": 1.3385460188611675,
      "grad_norm": 5.75,
      "learning_rate": 6.669340606449543e-06,
      "loss": 0.1487,
      "step": 8410
    },
    {
      "epoch": 1.3401376785643229,
      "grad_norm": 3.421875,
      "learning_rate": 6.653296967752286e-06,
      "loss": 0.1453,
      "step": 8420
    },
    {
      "epoch": 1.3417293382674784,
      "grad_norm": 4.5625,
      "learning_rate": 6.6372533290550304e-06,
      "loss": 0.128,
      "step": 8430
    },
    {
      "epoch": 1.343320997970634,
      "grad_norm": 1.578125,
      "learning_rate": 6.621209690357774e-06,
      "loss": 0.1232,
      "step": 8440
    },
    {
      "epoch": 1.3449126576737893,
      "grad_norm": 2.25,
      "learning_rate": 6.605166051660518e-06,
      "loss": 0.1187,
      "step": 8450
    },
    {
      "epoch": 1.3465043173769449,
      "grad_norm": 5.53125,
      "learning_rate": 6.58912241296326e-06,
      "loss": 0.1024,
      "step": 8460
    },
    {
      "epoch": 1.3480959770801002,
      "grad_norm": 4.25,
      "learning_rate": 6.573078774266004e-06,
      "loss": 0.0902,
      "step": 8470
    },
    {
      "epoch": 1.3496876367832558,
      "grad_norm": 3.15625,
      "learning_rate": 6.557035135568748e-06,
      "loss": 0.0888,
      "step": 8480
    },
    {
      "epoch": 1.3512792964864113,
      "grad_norm": 5.28125,
      "learning_rate": 6.54099149687149e-06,
      "loss": 0.147,
      "step": 8490
    },
    {
      "epoch": 1.3528709561895667,
      "grad_norm": 2.8125,
      "learning_rate": 6.524947858174234e-06,
      "loss": 0.0883,
      "step": 8500
    },
    {
      "epoch": 1.3544626158927222,
      "grad_norm": 5.0,
      "learning_rate": 6.5089042194769784e-06,
      "loss": 0.1072,
      "step": 8510
    },
    {
      "epoch": 1.3560542755958775,
      "grad_norm": 2.625,
      "learning_rate": 6.492860580779722e-06,
      "loss": 0.1115,
      "step": 8520
    },
    {
      "epoch": 1.357645935299033,
      "grad_norm": 4.75,
      "learning_rate": 6.476816942082465e-06,
      "loss": 0.0834,
      "step": 8530
    },
    {
      "epoch": 1.3592375950021887,
      "grad_norm": 1.96875,
      "learning_rate": 6.460773303385208e-06,
      "loss": 0.1692,
      "step": 8540
    },
    {
      "epoch": 1.360829254705344,
      "grad_norm": 2.0,
      "learning_rate": 6.444729664687952e-06,
      "loss": 0.1119,
      "step": 8550
    },
    {
      "epoch": 1.3624209144084993,
      "grad_norm": 4.46875,
      "learning_rate": 6.428686025990694e-06,
      "loss": 0.1069,
      "step": 8560
    },
    {
      "epoch": 1.3638534081413394,
      "eval_loss": 0.16366249322891235,
      "eval_runtime": 17.0689,
      "eval_samples_per_second": 14.881,
      "eval_steps_per_second": 14.881,
      "step": 8569
    },
    {
      "epoch": 1.3640125741116549,
      "grad_norm": 3.078125,
      "learning_rate": 6.412642387293438e-06,
      "loss": 0.0932,
      "step": 8570
    },
    {
      "epoch": 1.3656042338148104,
      "grad_norm": 2.453125,
      "learning_rate": 6.3965987485961824e-06,
      "loss": 0.0843,
      "step": 8580
    },
    {
      "epoch": 1.3671958935179658,
      "grad_norm": 3.390625,
      "learning_rate": 6.380555109898926e-06,
      "loss": 0.0914,
      "step": 8590
    },
    {
      "epoch": 1.3687875532211213,
      "grad_norm": 1.7578125,
      "learning_rate": 6.364511471201669e-06,
      "loss": 0.1207,
      "step": 8600
    },
    {
      "epoch": 1.3703792129242767,
      "grad_norm": 5.34375,
      "learning_rate": 6.348467832504413e-06,
      "loss": 0.1253,
      "step": 8610
    },
    {
      "epoch": 1.3719708726274322,
      "grad_norm": 2.203125,
      "learning_rate": 6.332424193807156e-06,
      "loss": 0.0879,
      "step": 8620
    },
    {
      "epoch": 1.3735625323305878,
      "grad_norm": 3.671875,
      "learning_rate": 6.316380555109899e-06,
      "loss": 0.1296,
      "step": 8630
    },
    {
      "epoch": 1.3751541920337431,
      "grad_norm": 2.65625,
      "learning_rate": 6.300336916412642e-06,
      "loss": 0.0998,
      "step": 8640
    },
    {
      "epoch": 1.3767458517368987,
      "grad_norm": 3.625,
      "learning_rate": 6.2842932777153864e-06,
      "loss": 0.1262,
      "step": 8650
    },
    {
      "epoch": 1.378337511440054,
      "grad_norm": 2.859375,
      "learning_rate": 6.2682496390181305e-06,
      "loss": 0.0973,
      "step": 8660
    },
    {
      "epoch": 1.3799291711432096,
      "grad_norm": 2.84375,
      "learning_rate": 6.252206000320873e-06,
      "loss": 0.0841,
      "step": 8670
    },
    {
      "epoch": 1.3815208308463651,
      "grad_norm": 2.34375,
      "learning_rate": 6.236162361623617e-06,
      "loss": 0.0953,
      "step": 8680
    },
    {
      "epoch": 1.3831124905495205,
      "grad_norm": 2.515625,
      "learning_rate": 6.22011872292636e-06,
      "loss": 0.0889,
      "step": 8690
    },
    {
      "epoch": 1.384704150252676,
      "grad_norm": 2.390625,
      "learning_rate": 6.204075084229104e-06,
      "loss": 0.1135,
      "step": 8700
    },
    {
      "epoch": 1.3862958099558313,
      "grad_norm": 3.296875,
      "learning_rate": 6.188031445531847e-06,
      "loss": 0.1129,
      "step": 8710
    },
    {
      "epoch": 1.387887469658987,
      "grad_norm": 2.703125,
      "learning_rate": 6.1719878068345904e-06,
      "loss": 0.1141,
      "step": 8720
    },
    {
      "epoch": 1.3894791293621425,
      "grad_norm": 2.921875,
      "learning_rate": 6.1559441681373345e-06,
      "loss": 0.1392,
      "step": 8730
    },
    {
      "epoch": 1.3910707890652978,
      "grad_norm": 3.03125,
      "learning_rate": 6.139900529440077e-06,
      "loss": 0.0961,
      "step": 8740
    },
    {
      "epoch": 1.3926624487684534,
      "grad_norm": 4.4375,
      "learning_rate": 6.123856890742821e-06,
      "loss": 0.1084,
      "step": 8750
    },
    {
      "epoch": 1.3942541084716087,
      "grad_norm": 3.5625,
      "learning_rate": 6.107813252045565e-06,
      "loss": 0.1206,
      "step": 8760
    },
    {
      "epoch": 1.3958457681747642,
      "grad_norm": 2.625,
      "learning_rate": 6.091769613348308e-06,
      "loss": 0.0844,
      "step": 8770
    },
    {
      "epoch": 1.3971190959372886,
      "eval_loss": 0.16162684559822083,
      "eval_runtime": 17.2673,
      "eval_samples_per_second": 14.71,
      "eval_steps_per_second": 14.71,
      "step": 8778
    },
    {
      "epoch": 1.3974374278779198,
      "grad_norm": 5.15625,
      "learning_rate": 6.075725974651051e-06,
      "loss": 0.1235,
      "step": 8780
    },
    {
      "epoch": 1.3990290875810751,
      "grad_norm": 2.46875,
      "learning_rate": 6.059682335953795e-06,
      "loss": 0.1296,
      "step": 8790
    },
    {
      "epoch": 1.4006207472842307,
      "grad_norm": 2.828125,
      "learning_rate": 6.0436386972565385e-06,
      "loss": 0.0782,
      "step": 8800
    },
    {
      "epoch": 1.402212406987386,
      "grad_norm": 2.796875,
      "learning_rate": 6.027595058559282e-06,
      "loss": 0.143,
      "step": 8810
    },
    {
      "epoch": 1.4038040666905416,
      "grad_norm": 5.03125,
      "learning_rate": 6.011551419862025e-06,
      "loss": 0.1508,
      "step": 8820
    },
    {
      "epoch": 1.4053957263936971,
      "grad_norm": 3.375,
      "learning_rate": 5.995507781164769e-06,
      "loss": 0.1162,
      "step": 8830
    },
    {
      "epoch": 1.4069873860968525,
      "grad_norm": 2.859375,
      "learning_rate": 5.979464142467513e-06,
      "loss": 0.0886,
      "step": 8840
    },
    {
      "epoch": 1.408579045800008,
      "grad_norm": 2.875,
      "learning_rate": 5.963420503770255e-06,
      "loss": 0.1068,
      "step": 8850
    },
    {
      "epoch": 1.4101707055031634,
      "grad_norm": 2.71875,
      "learning_rate": 5.947376865072999e-06,
      "loss": 0.1112,
      "step": 8860
    },
    {
      "epoch": 1.411762365206319,
      "grad_norm": 3.984375,
      "learning_rate": 5.9313332263757424e-06,
      "loss": 0.105,
      "step": 8870
    },
    {
      "epoch": 1.4133540249094745,
      "grad_norm": 3.859375,
      "learning_rate": 5.915289587678486e-06,
      "loss": 0.0893,
      "step": 8880
    },
    {
      "epoch": 1.4149456846126298,
      "grad_norm": 3.546875,
      "learning_rate": 5.89924594898123e-06,
      "loss": 0.1071,
      "step": 8890
    },
    {
      "epoch": 1.4165373443157852,
      "grad_norm": 2.828125,
      "learning_rate": 5.883202310283973e-06,
      "loss": 0.1105,
      "step": 8900
    },
    {
      "epoch": 1.4181290040189407,
      "grad_norm": 4.25,
      "learning_rate": 5.867158671586717e-06,
      "loss": 0.1019,
      "step": 8910
    },
    {
      "epoch": 1.4197206637220963,
      "grad_norm": 2.5625,
      "learning_rate": 5.851115032889459e-06,
      "loss": 0.1251,
      "step": 8920
    },
    {
      "epoch": 1.4213123234252516,
      "grad_norm": 2.34375,
      "learning_rate": 5.835071394192203e-06,
      "loss": 0.1304,
      "step": 8930
    },
    {
      "epoch": 1.4229039831284072,
      "grad_norm": 2.75,
      "learning_rate": 5.819027755494947e-06,
      "loss": 0.0898,
      "step": 8940
    },
    {
      "epoch": 1.4244956428315625,
      "grad_norm": 3.046875,
      "learning_rate": 5.80298411679769e-06,
      "loss": 0.0946,
      "step": 8950
    },
    {
      "epoch": 1.426087302534718,
      "grad_norm": 4.15625,
      "learning_rate": 5.786940478100434e-06,
      "loss": 0.1687,
      "step": 8960
    },
    {
      "epoch": 1.4276789622378736,
      "grad_norm": 4.65625,
      "learning_rate": 5.770896839403177e-06,
      "loss": 0.0957,
      "step": 8970
    },
    {
      "epoch": 1.429270621941029,
      "grad_norm": 2.890625,
      "learning_rate": 5.754853200705921e-06,
      "loss": 0.1061,
      "step": 8980
    },
    {
      "epoch": 1.430384783733238,
      "eval_loss": 0.1616346836090088,
      "eval_runtime": 17.2071,
      "eval_samples_per_second": 14.761,
      "eval_steps_per_second": 14.761,
      "step": 8987
    },
    {
      "epoch": 1.4308622816441845,
      "grad_norm": 7.09375,
      "learning_rate": 5.738809562008664e-06,
      "loss": 0.1165,
      "step": 8990
    },
    {
      "epoch": 1.4324539413473398,
      "grad_norm": 3.859375,
      "learning_rate": 5.722765923311407e-06,
      "loss": 0.129,
      "step": 9000
    },
    {
      "epoch": 1.4340456010504954,
      "grad_norm": 4.5625,
      "learning_rate": 5.706722284614151e-06,
      "loss": 0.0805,
      "step": 9010
    },
    {
      "epoch": 1.435637260753651,
      "grad_norm": 4.28125,
      "learning_rate": 5.690678645916894e-06,
      "loss": 0.0987,
      "step": 9020
    },
    {
      "epoch": 1.4372289204568063,
      "grad_norm": 4.125,
      "learning_rate": 5.674635007219638e-06,
      "loss": 0.1021,
      "step": 9030
    },
    {
      "epoch": 1.4388205801599618,
      "grad_norm": 2.0625,
      "learning_rate": 5.658591368522382e-06,
      "loss": 0.0958,
      "step": 9040
    },
    {
      "epoch": 1.4404122398631172,
      "grad_norm": 3.828125,
      "learning_rate": 5.642547729825125e-06,
      "loss": 0.0836,
      "step": 9050
    },
    {
      "epoch": 1.4420038995662727,
      "grad_norm": 3.09375,
      "learning_rate": 5.626504091127868e-06,
      "loss": 0.0889,
      "step": 9060
    },
    {
      "epoch": 1.4435955592694283,
      "grad_norm": 3.4375,
      "learning_rate": 5.610460452430612e-06,
      "loss": 0.1164,
      "step": 9070
    },
    {
      "epoch": 1.4451872189725836,
      "grad_norm": 3.296875,
      "learning_rate": 5.594416813733355e-06,
      "loss": 0.0989,
      "step": 9080
    },
    {
      "epoch": 1.4467788786757392,
      "grad_norm": 3.03125,
      "learning_rate": 5.5783731750360985e-06,
      "loss": 0.0845,
      "step": 9090
    },
    {
      "epoch": 1.4483705383788945,
      "grad_norm": 6.78125,
      "learning_rate": 5.562329536338842e-06,
      "loss": 0.0988,
      "step": 9100
    },
    {
      "epoch": 1.44996219808205,
      "grad_norm": 2.390625,
      "learning_rate": 5.546285897641586e-06,
      "loss": 0.1185,
      "step": 9110
    },
    {
      "epoch": 1.4515538577852056,
      "grad_norm": 3.421875,
      "learning_rate": 5.53024225894433e-06,
      "loss": 0.0933,
      "step": 9120
    },
    {
      "epoch": 1.453145517488361,
      "grad_norm": 4.5625,
      "learning_rate": 5.514198620247072e-06,
      "loss": 0.1017,
      "step": 9130
    },
    {
      "epoch": 1.4547371771915165,
      "grad_norm": 3.953125,
      "learning_rate": 5.498154981549816e-06,
      "loss": 0.1098,
      "step": 9140
    },
    {
      "epoch": 1.4563288368946719,
      "grad_norm": 2.734375,
      "learning_rate": 5.482111342852559e-06,
      "loss": 0.0915,
      "step": 9150
    },
    {
      "epoch": 1.4579204965978274,
      "grad_norm": 2.53125,
      "learning_rate": 5.466067704155303e-06,
      "loss": 0.114,
      "step": 9160
    },
    {
      "epoch": 1.459512156300983,
      "grad_norm": 2.40625,
      "learning_rate": 5.4500240654580465e-06,
      "loss": 0.0827,
      "step": 9170
    },
    {
      "epoch": 1.4611038160041383,
      "grad_norm": 4.59375,
      "learning_rate": 5.43398042676079e-06,
      "loss": 0.1146,
      "step": 9180
    },
    {
      "epoch": 1.4626954757072939,
      "grad_norm": 2.015625,
      "learning_rate": 5.417936788063534e-06,
      "loss": 0.1111,
      "step": 9190
    },
    {
      "epoch": 1.4636504715291871,
      "eval_loss": 0.16260889172554016,
      "eval_runtime": 17.0764,
      "eval_samples_per_second": 14.874,
      "eval_steps_per_second": 14.874,
      "step": 9196
    },
    {
      "epoch": 1.4642871354104492,
      "grad_norm": 2.765625,
      "learning_rate": 5.401893149366276e-06,
      "loss": 0.0995,
      "step": 9200
    },
    {
      "epoch": 1.4658787951136047,
      "grad_norm": 2.46875,
      "learning_rate": 5.38584951066902e-06,
      "loss": 0.1064,
      "step": 9210
    },
    {
      "epoch": 1.4674704548167603,
      "grad_norm": 2.515625,
      "learning_rate": 5.369805871971764e-06,
      "loss": 0.1465,
      "step": 9220
    },
    {
      "epoch": 1.4690621145199156,
      "grad_norm": 2.5,
      "learning_rate": 5.353762233274507e-06,
      "loss": 0.1402,
      "step": 9230
    },
    {
      "epoch": 1.470653774223071,
      "grad_norm": 2.78125,
      "learning_rate": 5.3377185945772505e-06,
      "loss": 0.0763,
      "step": 9240
    },
    {
      "epoch": 1.4722454339262265,
      "grad_norm": 3.15625,
      "learning_rate": 5.321674955879994e-06,
      "loss": 0.1253,
      "step": 9250
    },
    {
      "epoch": 1.473837093629382,
      "grad_norm": 2.375,
      "learning_rate": 5.305631317182738e-06,
      "loss": 0.095,
      "step": 9260
    },
    {
      "epoch": 1.4754287533325374,
      "grad_norm": 3.921875,
      "learning_rate": 5.289587678485481e-06,
      "loss": 0.0849,
      "step": 9270
    },
    {
      "epoch": 1.477020413035693,
      "grad_norm": 4.28125,
      "learning_rate": 5.273544039788224e-06,
      "loss": 0.1178,
      "step": 9280
    },
    {
      "epoch": 1.4786120727388483,
      "grad_norm": 4.15625,
      "learning_rate": 5.257500401090968e-06,
      "loss": 0.1144,
      "step": 9290
    },
    {
      "epoch": 1.4802037324420039,
      "grad_norm": 2.359375,
      "learning_rate": 5.241456762393712e-06,
      "loss": 0.1065,
      "step": 9300
    },
    {
      "epoch": 1.4817953921451594,
      "grad_norm": 1.9765625,
      "learning_rate": 5.2254131236964545e-06,
      "loss": 0.1204,
      "step": 9310
    },
    {
      "epoch": 1.4833870518483148,
      "grad_norm": 3.3125,
      "learning_rate": 5.2093694849991985e-06,
      "loss": 0.0922,
      "step": 9320
    },
    {
      "epoch": 1.4849787115514703,
      "grad_norm": 4.40625,
      "learning_rate": 5.193325846301942e-06,
      "loss": 0.1072,
      "step": 9330
    },
    {
      "epoch": 1.4865703712546257,
      "grad_norm": 2.296875,
      "learning_rate": 5.177282207604685e-06,
      "loss": 0.1151,
      "step": 9340
    },
    {
      "epoch": 1.4881620309577812,
      "grad_norm": 3.078125,
      "learning_rate": 5.161238568907428e-06,
      "loss": 0.0816,
      "step": 9350
    },
    {
      "epoch": 1.4897536906609368,
      "grad_norm": 5.25,
      "learning_rate": 5.145194930210172e-06,
      "loss": 0.1163,
      "step": 9360
    },
    {
      "epoch": 1.491345350364092,
      "grad_norm": 3.71875,
      "learning_rate": 5.129151291512916e-06,
      "loss": 0.1019,
      "step": 9370
    },
    {
      "epoch": 1.4929370100672477,
      "grad_norm": 1.8046875,
      "learning_rate": 5.1131076528156585e-06,
      "loss": 0.1051,
      "step": 9380
    },
    {
      "epoch": 1.494528669770403,
      "grad_norm": 2.90625,
      "learning_rate": 5.0970640141184025e-06,
      "loss": 0.1389,
      "step": 9390
    },
    {
      "epoch": 1.4961203294735586,
      "grad_norm": 4.15625,
      "learning_rate": 5.0810203754211465e-06,
      "loss": 0.1334,
      "step": 9400
    },
    {
      "epoch": 1.4969161593251363,
      "eval_loss": 0.16110365092754364,
      "eval_runtime": 19.9895,
      "eval_samples_per_second": 12.707,
      "eval_steps_per_second": 12.707,
      "step": 9405
    },
    {
      "epoch": 1.497711989176714,
      "grad_norm": 2.359375,
      "learning_rate": 5.064976736723889e-06,
      "loss": 0.0848,
      "step": 9410
    },
    {
      "epoch": 1.4993036488798694,
      "grad_norm": 3.234375,
      "learning_rate": 5.048933098026633e-06,
      "loss": 0.1045,
      "step": 9420
    },
    {
      "epoch": 1.500895308583025,
      "grad_norm": 3.1875,
      "learning_rate": 5.032889459329376e-06,
      "loss": 0.1423,
      "step": 9430
    },
    {
      "epoch": 1.5024869682861803,
      "grad_norm": 3.6875,
      "learning_rate": 5.01684582063212e-06,
      "loss": 0.1259,
      "step": 9440
    },
    {
      "epoch": 1.504078627989336,
      "grad_norm": 2.546875,
      "learning_rate": 5.000802181934863e-06,
      "loss": 0.1082,
      "step": 9450
    },
    {
      "epoch": 1.5056702876924914,
      "grad_norm": 2.46875,
      "learning_rate": 4.9847585432376065e-06,
      "loss": 0.1207,
      "step": 9460
    },
    {
      "epoch": 1.5072619473956468,
      "grad_norm": 3.578125,
      "learning_rate": 4.96871490454035e-06,
      "loss": 0.1293,
      "step": 9470
    },
    {
      "epoch": 1.5088536070988021,
      "grad_norm": 2.5625,
      "learning_rate": 4.952671265843094e-06,
      "loss": 0.1337,
      "step": 9480
    },
    {
      "epoch": 1.5104452668019577,
      "grad_norm": 2.015625,
      "learning_rate": 4.936627627145837e-06,
      "loss": 0.0773,
      "step": 9490
    },
    {
      "epoch": 1.5120369265051132,
      "grad_norm": 3.34375,
      "learning_rate": 4.920583988448581e-06,
      "loss": 0.1782,
      "step": 9500
    },
    {
      "epoch": 1.5136285862082688,
      "grad_norm": 3.0625,
      "learning_rate": 4.904540349751324e-06,
      "loss": 0.1468,
      "step": 9510
    },
    {
      "epoch": 1.5152202459114241,
      "grad_norm": 3.0625,
      "learning_rate": 4.888496711054067e-06,
      "loss": 0.1353,
      "step": 9520
    },
    {
      "epoch": 1.5168119056145795,
      "grad_norm": 1.8359375,
      "learning_rate": 4.8724530723568105e-06,
      "loss": 0.1066,
      "step": 9530
    },
    {
      "epoch": 1.518403565317735,
      "grad_norm": 2.46875,
      "learning_rate": 4.8564094336595545e-06,
      "loss": 0.1054,
      "step": 9540
    },
    {
      "epoch": 1.5199952250208906,
      "grad_norm": 5.03125,
      "learning_rate": 4.840365794962298e-06,
      "loss": 0.1189,
      "step": 9550
    },
    {
      "epoch": 1.5215868847240461,
      "grad_norm": 3.234375,
      "learning_rate": 4.824322156265041e-06,
      "loss": 0.151,
      "step": 9560
    },
    {
      "epoch": 1.5231785444272015,
      "grad_norm": 4.09375,
      "learning_rate": 4.808278517567785e-06,
      "loss": 0.0774,
      "step": 9570
    },
    {
      "epoch": 1.5247702041303568,
      "grad_norm": 2.484375,
      "learning_rate": 4.792234878870528e-06,
      "loss": 0.0992,
      "step": 9580
    },
    {
      "epoch": 1.5263618638335124,
      "grad_norm": 2.953125,
      "learning_rate": 4.776191240173272e-06,
      "loss": 0.1203,
      "step": 9590
    },
    {
      "epoch": 1.527953523536668,
      "grad_norm": 3.078125,
      "learning_rate": 4.760147601476015e-06,
      "loss": 0.1008,
      "step": 9600
    },
    {
      "epoch": 1.5295451832398235,
      "grad_norm": 5.21875,
      "learning_rate": 4.7441039627787585e-06,
      "loss": 0.0951,
      "step": 9610
    },
    {
      "epoch": 1.5301818471210855,
      "eval_loss": 0.16086436808109283,
      "eval_runtime": 17.3965,
      "eval_samples_per_second": 14.601,
      "eval_steps_per_second": 14.601,
      "step": 9614
    },
    {
      "epoch": 1.5311368429429788,
      "grad_norm": 2.1875,
      "learning_rate": 4.728060324081502e-06,
      "loss": 0.1084,
      "step": 9620
    },
    {
      "epoch": 1.5327285026461341,
      "grad_norm": 3.078125,
      "learning_rate": 4.712016685384245e-06,
      "loss": 0.1028,
      "step": 9630
    },
    {
      "epoch": 1.5343201623492897,
      "grad_norm": 1.7421875,
      "learning_rate": 4.695973046686989e-06,
      "loss": 0.0769,
      "step": 9640
    },
    {
      "epoch": 1.5359118220524453,
      "grad_norm": 3.265625,
      "learning_rate": 4.679929407989732e-06,
      "loss": 0.0923,
      "step": 9650
    },
    {
      "epoch": 1.5375034817556008,
      "grad_norm": 3.25,
      "learning_rate": 4.663885769292476e-06,
      "loss": 0.0944,
      "step": 9660
    },
    {
      "epoch": 1.5390951414587561,
      "grad_norm": 3.34375,
      "learning_rate": 4.647842130595219e-06,
      "loss": 0.1146,
      "step": 9670
    },
    {
      "epoch": 1.5406868011619115,
      "grad_norm": 3.5625,
      "learning_rate": 4.631798491897963e-06,
      "loss": 0.1188,
      "step": 9680
    },
    {
      "epoch": 1.542278460865067,
      "grad_norm": 3.203125,
      "learning_rate": 4.6157548532007065e-06,
      "loss": 0.1112,
      "step": 9690
    },
    {
      "epoch": 1.5438701205682226,
      "grad_norm": 3.03125,
      "learning_rate": 4.59971121450345e-06,
      "loss": 0.107,
      "step": 9700
    },
    {
      "epoch": 1.545461780271378,
      "grad_norm": 2.59375,
      "learning_rate": 4.583667575806193e-06,
      "loss": 0.1518,
      "step": 9710
    },
    {
      "epoch": 1.5470534399745335,
      "grad_norm": 5.875,
      "learning_rate": 4.567623937108936e-06,
      "loss": 0.1106,
      "step": 9720
    },
    {
      "epoch": 1.5486450996776888,
      "grad_norm": 2.3125,
      "learning_rate": 4.55158029841168e-06,
      "loss": 0.0991,
      "step": 9730
    },
    {
      "epoch": 1.5502367593808444,
      "grad_norm": 3.59375,
      "learning_rate": 4.535536659714423e-06,
      "loss": 0.1061,
      "step": 9740
    },
    {
      "epoch": 1.551828419084,
      "grad_norm": 5.09375,
      "learning_rate": 4.519493021017167e-06,
      "loss": 0.1138,
      "step": 9750
    },
    {
      "epoch": 1.5534200787871553,
      "grad_norm": 4.28125,
      "learning_rate": 4.5034493823199105e-06,
      "loss": 0.0924,
      "step": 9760
    },
    {
      "epoch": 1.5550117384903108,
      "grad_norm": 2.890625,
      "learning_rate": 4.4874057436226545e-06,
      "loss": 0.0881,
      "step": 9770
    },
    {
      "epoch": 1.5566033981934662,
      "grad_norm": 2.6875,
      "learning_rate": 4.471362104925398e-06,
      "loss": 0.0741,
      "step": 9780
    },
    {
      "epoch": 1.5581950578966217,
      "grad_norm": 4.28125,
      "learning_rate": 4.455318466228141e-06,
      "loss": 0.1205,
      "step": 9790
    },
    {
      "epoch": 1.5597867175997773,
      "grad_norm": 2.8125,
      "learning_rate": 4.439274827530884e-06,
      "loss": 0.0847,
      "step": 9800
    },
    {
      "epoch": 1.5613783773029326,
      "grad_norm": 3.34375,
      "learning_rate": 4.423231188833627e-06,
      "loss": 0.1015,
      "step": 9810
    },
    {
      "epoch": 1.562970037006088,
      "grad_norm": 2.328125,
      "learning_rate": 4.407187550136371e-06,
      "loss": 0.1196,
      "step": 9820
    },
    {
      "epoch": 1.5634475349170347,
      "eval_loss": 0.16066288948059082,
      "eval_runtime": 20.0261,
      "eval_samples_per_second": 12.683,
      "eval_steps_per_second": 12.683,
      "step": 9823
    },
    {
      "epoch": 1.5645616967092435,
      "grad_norm": 6.65625,
      "learning_rate": 4.3911439114391145e-06,
      "loss": 0.1107,
      "step": 9830
    },
    {
      "epoch": 1.566153356412399,
      "grad_norm": 3.578125,
      "learning_rate": 4.3751002727418585e-06,
      "loss": 0.1005,
      "step": 9840
    },
    {
      "epoch": 1.5677450161155546,
      "grad_norm": 2.5,
      "learning_rate": 4.359056634044602e-06,
      "loss": 0.0796,
      "step": 9850
    },
    {
      "epoch": 1.56933667581871,
      "grad_norm": 5.9375,
      "learning_rate": 4.343012995347345e-06,
      "loss": 0.0792,
      "step": 9860
    },
    {
      "epoch": 1.5709283355218653,
      "grad_norm": 3.875,
      "learning_rate": 4.326969356650089e-06,
      "loss": 0.103,
      "step": 9870
    },
    {
      "epoch": 1.5725199952250208,
      "grad_norm": 3.453125,
      "learning_rate": 4.310925717952832e-06,
      "loss": 0.1062,
      "step": 9880
    },
    {
      "epoch": 1.5741116549281764,
      "grad_norm": 2.921875,
      "learning_rate": 4.294882079255575e-06,
      "loss": 0.1192,
      "step": 9890
    },
    {
      "epoch": 1.575703314631332,
      "grad_norm": 5.28125,
      "learning_rate": 4.2788384405583185e-06,
      "loss": 0.1233,
      "step": 9900
    },
    {
      "epoch": 1.5772949743344873,
      "grad_norm": 2.15625,
      "learning_rate": 4.2627948018610625e-06,
      "loss": 0.0889,
      "step": 9910
    },
    {
      "epoch": 1.5788866340376426,
      "grad_norm": 4.78125,
      "learning_rate": 4.246751163163806e-06,
      "loss": 0.1075,
      "step": 9920
    },
    {
      "epoch": 1.5804782937407982,
      "grad_norm": 2.734375,
      "learning_rate": 4.230707524466549e-06,
      "loss": 0.1045,
      "step": 9930
    },
    {
      "epoch": 1.5820699534439537,
      "grad_norm": 3.234375,
      "learning_rate": 4.214663885769293e-06,
      "loss": 0.0938,
      "step": 9940
    },
    {
      "epoch": 1.5836616131471093,
      "grad_norm": 3.90625,
      "learning_rate": 4.198620247072036e-06,
      "loss": 0.088,
      "step": 9950
    },
    {
      "epoch": 1.5852532728502646,
      "grad_norm": 4.46875,
      "learning_rate": 4.18257660837478e-06,
      "loss": 0.1682,
      "step": 9960
    },
    {
      "epoch": 1.58684493255342,
      "grad_norm": 3.8125,
      "learning_rate": 4.166532969677523e-06,
      "loss": 0.1119,
      "step": 9970
    },
    {
      "epoch": 1.5884365922565755,
      "grad_norm": 4.03125,
      "learning_rate": 4.1504893309802665e-06,
      "loss": 0.0841,
      "step": 9980
    },
    {
      "epoch": 1.590028251959731,
      "grad_norm": 2.125,
      "learning_rate": 4.13444569228301e-06,
      "loss": 0.077,
      "step": 9990
    },
    {
      "epoch": 1.5916199116628866,
      "grad_norm": 5.71875,
      "learning_rate": 4.118402053585754e-06,
      "loss": 0.0816,
      "step": 10000
    },
    {
      "epoch": 1.593211571366042,
      "grad_norm": 1.953125,
      "learning_rate": 4.102358414888497e-06,
      "loss": 0.163,
      "step": 10010
    },
    {
      "epoch": 1.5948032310691973,
      "grad_norm": 2.515625,
      "learning_rate": 4.08631477619124e-06,
      "loss": 0.115,
      "step": 10020
    },
    {
      "epoch": 1.5963948907723529,
      "grad_norm": 3.046875,
      "learning_rate": 4.070271137493984e-06,
      "loss": 0.1422,
      "step": 10030
    },
    {
      "epoch": 1.5967132227129839,
      "eval_loss": 0.16062113642692566,
      "eval_runtime": 17.5169,
      "eval_samples_per_second": 14.5,
      "eval_steps_per_second": 14.5,
      "step": 10032
    },
    {
      "epoch": 1.5979865504755084,
      "grad_norm": 3.4375,
      "learning_rate": 4.054227498796727e-06,
      "loss": 0.1199,
      "step": 10040
    },
    {
      "epoch": 1.5995782101786638,
      "grad_norm": 6.15625,
      "learning_rate": 4.038183860099471e-06,
      "loss": 0.1404,
      "step": 10050
    },
    {
      "epoch": 1.6011698698818193,
      "grad_norm": 5.34375,
      "learning_rate": 4.0221402214022145e-06,
      "loss": 0.1328,
      "step": 10060
    },
    {
      "epoch": 1.6027615295849746,
      "grad_norm": 3.625,
      "learning_rate": 4.006096582704958e-06,
      "loss": 0.1547,
      "step": 10070
    },
    {
      "epoch": 1.6043531892881302,
      "grad_norm": 3.28125,
      "learning_rate": 3.990052944007701e-06,
      "loss": 0.1319,
      "step": 10080
    },
    {
      "epoch": 1.6059448489912858,
      "grad_norm": 3.34375,
      "learning_rate": 3.974009305310444e-06,
      "loss": 0.1186,
      "step": 10090
    },
    {
      "epoch": 1.607536508694441,
      "grad_norm": 2.03125,
      "learning_rate": 3.957965666613188e-06,
      "loss": 0.1004,
      "step": 10100
    },
    {
      "epoch": 1.6091281683975966,
      "grad_norm": 1.296875,
      "learning_rate": 3.941922027915931e-06,
      "loss": 0.124,
      "step": 10110
    },
    {
      "epoch": 1.610719828100752,
      "grad_norm": 3.9375,
      "learning_rate": 3.925878389218675e-06,
      "loss": 0.0799,
      "step": 10120
    },
    {
      "epoch": 1.6123114878039075,
      "grad_norm": 2.125,
      "learning_rate": 3.9098347505214185e-06,
      "loss": 0.1146,
      "step": 10130
    },
    {
      "epoch": 1.613903147507063,
      "grad_norm": 6.03125,
      "learning_rate": 3.8937911118241626e-06,
      "loss": 0.1139,
      "step": 10140
    },
    {
      "epoch": 1.6154948072102184,
      "grad_norm": 3.25,
      "learning_rate": 3.877747473126906e-06,
      "loss": 0.1219,
      "step": 10150
    },
    {
      "epoch": 1.6170864669133738,
      "grad_norm": 1.8515625,
      "learning_rate": 3.861703834429649e-06,
      "loss": 0.0707,
      "step": 10160
    },
    {
      "epoch": 1.6186781266165293,
      "grad_norm": 2.109375,
      "learning_rate": 3.845660195732392e-06,
      "loss": 0.0997,
      "step": 10170
    },
    {
      "epoch": 1.6202697863196849,
      "grad_norm": 3.59375,
      "learning_rate": 3.829616557035135e-06,
      "loss": 0.1089,
      "step": 10180
    },
    {
      "epoch": 1.6218614460228404,
      "grad_norm": 1.71875,
      "learning_rate": 3.8135729183378793e-06,
      "loss": 0.1189,
      "step": 10190
    },
    {
      "epoch": 1.6234531057259958,
      "grad_norm": 3.140625,
      "learning_rate": 3.7975292796406225e-06,
      "loss": 0.1029,
      "step": 10200
    },
    {
      "epoch": 1.625044765429151,
      "grad_norm": 1.7578125,
      "learning_rate": 3.7814856409433666e-06,
      "loss": 0.0923,
      "step": 10210
    },
    {
      "epoch": 1.6266364251323067,
      "grad_norm": 2.15625,
      "learning_rate": 3.7654420022461097e-06,
      "loss": 0.1243,
      "step": 10220
    },
    {
      "epoch": 1.6282280848354622,
      "grad_norm": 2.09375,
      "learning_rate": 3.7493983635488534e-06,
      "loss": 0.0868,
      "step": 10230
    },
    {
      "epoch": 1.6298197445386178,
      "grad_norm": 2.390625,
      "learning_rate": 3.7333547248515965e-06,
      "loss": 0.0881,
      "step": 10240
    },
    {
      "epoch": 1.629978910508933,
      "eval_loss": 0.1606595814228058,
      "eval_runtime": 6567.6934,
      "eval_samples_per_second": 0.039,
      "eval_steps_per_second": 0.039,
      "step": 10241
    },
    {
      "epoch": 1.631411404241773,
      "grad_norm": 6.03125,
      "learning_rate": 3.7173110861543397e-06,
      "loss": 0.1206,
      "step": 10250
    },
    {
      "epoch": 1.6330030639449284,
      "grad_norm": 3.90625,
      "learning_rate": 3.7012674474570838e-06,
      "loss": 0.1075,
      "step": 10260
    },
    {
      "epoch": 1.634594723648084,
      "grad_norm": 5.25,
      "learning_rate": 3.685223808759827e-06,
      "loss": 0.1448,
      "step": 10270
    },
    {
      "epoch": 1.6361863833512396,
      "grad_norm": 3.84375,
      "learning_rate": 3.6691801700625706e-06,
      "loss": 0.1349,
      "step": 10280
    },
    {
      "epoch": 1.6377780430543951,
      "grad_norm": 4.28125,
      "learning_rate": 3.6531365313653137e-06,
      "loss": 0.1266,
      "step": 10290
    },
    {
      "epoch": 1.6393697027575505,
      "grad_norm": 2.0,
      "learning_rate": 3.6370928926680578e-06,
      "loss": 0.1059,
      "step": 10300
    },
    {
      "epoch": 1.6409613624607058,
      "grad_norm": 4.6875,
      "learning_rate": 3.621049253970801e-06,
      "loss": 0.1195,
      "step": 10310
    },
    {
      "epoch": 1.6425530221638613,
      "grad_norm": 5.0,
      "learning_rate": 3.605005615273544e-06,
      "loss": 0.1374,
      "step": 10320
    },
    {
      "epoch": 1.644144681867017,
      "grad_norm": 2.453125,
      "learning_rate": 3.5889619765762878e-06,
      "loss": 0.0762,
      "step": 10330
    },
    {
      "epoch": 1.6457363415701725,
      "grad_norm": 4.875,
      "learning_rate": 3.572918337879031e-06,
      "loss": 0.1339,
      "step": 10340
    },
    {
      "epoch": 1.6473280012733278,
      "grad_norm": 2.21875,
      "learning_rate": 3.556874699181775e-06,
      "loss": 0.0792,
      "step": 10350
    },
    {
      "epoch": 1.6489196609764831,
      "grad_norm": 4.78125,
      "learning_rate": 3.540831060484518e-06,
      "loss": 0.1019,
      "step": 10360
    },
    {
      "epoch": 1.6505113206796387,
      "grad_norm": 2.140625,
      "learning_rate": 3.5247874217872618e-06,
      "loss": 0.0731,
      "step": 10370
    },
    {
      "epoch": 1.6521029803827942,
      "grad_norm": 2.734375,
      "learning_rate": 3.508743783090005e-06,
      "loss": 0.1262,
      "step": 10380
    },
    {
      "epoch": 1.6536946400859496,
      "grad_norm": 2.390625,
      "learning_rate": 3.492700144392748e-06,
      "loss": 0.0992,
      "step": 10390
    },
    {
      "epoch": 1.6552862997891051,
      "grad_norm": 4.46875,
      "learning_rate": 3.476656505695492e-06,
      "loss": 0.109,
      "step": 10400
    },
    {
      "epoch": 1.6568779594922605,
      "grad_norm": 4.03125,
      "learning_rate": 3.4606128669982354e-06,
      "loss": 0.1185,
      "step": 10410
    },
    {
      "epoch": 1.658469619195416,
      "grad_norm": 4.34375,
      "learning_rate": 3.444569228300979e-06,
      "loss": 0.1399,
      "step": 10420
    },
    {
      "epoch": 1.6600612788985716,
      "grad_norm": 5.71875,
      "learning_rate": 3.428525589603722e-06,
      "loss": 0.1051,
      "step": 10430
    },
    {
      "epoch": 1.661652938601727,
      "grad_norm": 6.0625,
      "learning_rate": 3.412481950906466e-06,
      "loss": 0.0843,
      "step": 10440
    },
    {
      "epoch": 1.6632445983048825,
      "grad_norm": 2.0625,
      "learning_rate": 3.3964383122092094e-06,
      "loss": 0.1053,
      "step": 10450
    },
    {
      "epoch": 1.6632445983048825,
      "eval_loss": 0.1602632999420166,
      "eval_runtime": 17.2866,
      "eval_samples_per_second": 14.693,
      "eval_steps_per_second": 14.693,
      "step": 10450
    },
    {
      "epoch": 1.6648362580080378,
      "grad_norm": 3.65625,
      "learning_rate": 3.380394673511953e-06,
      "loss": 0.0974,
      "step": 10460
    },
    {
      "epoch": 1.6664279177111934,
      "grad_norm": 6.25,
      "learning_rate": 3.364351034814696e-06,
      "loss": 0.1608,
      "step": 10470
    },
    {
      "epoch": 1.668019577414349,
      "grad_norm": 1.859375,
      "learning_rate": 3.3483073961174393e-06,
      "loss": 0.126,
      "step": 10480
    },
    {
      "epoch": 1.6696112371175043,
      "grad_norm": 4.34375,
      "learning_rate": 3.3322637574201834e-06,
      "loss": 0.1425,
      "step": 10490
    },
    {
      "epoch": 1.6712028968206596,
      "grad_norm": 1.546875,
      "learning_rate": 3.3162201187229266e-06,
      "loss": 0.1038,
      "step": 10500
    },
    {
      "epoch": 1.6727945565238151,
      "grad_norm": 4.21875,
      "learning_rate": 3.30017648002567e-06,
      "loss": 0.1103,
      "step": 10510
    },
    {
      "epoch": 1.6743862162269707,
      "grad_norm": 4.28125,
      "learning_rate": 3.2841328413284134e-06,
      "loss": 0.102,
      "step": 10520
    },
    {
      "epoch": 1.6759778759301263,
      "grad_norm": 5.8125,
      "learning_rate": 3.2680892026311574e-06,
      "loss": 0.0728,
      "step": 10530
    },
    {
      "epoch": 1.6775695356332816,
      "grad_norm": 2.703125,
      "learning_rate": 3.2520455639339006e-06,
      "loss": 0.0967,
      "step": 10540
    },
    {
      "epoch": 1.679161195336437,
      "grad_norm": 4.65625,
      "learning_rate": 3.2360019252366438e-06,
      "loss": 0.1328,
      "step": 10550
    },
    {
      "epoch": 1.6807528550395925,
      "grad_norm": 3.09375,
      "learning_rate": 3.2199582865393874e-06,
      "loss": 0.0952,
      "step": 10560
    },
    {
      "epoch": 1.682344514742748,
      "grad_norm": 2.765625,
      "learning_rate": 3.2039146478421306e-06,
      "loss": 0.1204,
      "step": 10570
    },
    {
      "epoch": 1.6839361744459036,
      "grad_norm": 3.875,
      "learning_rate": 3.1878710091448746e-06,
      "loss": 0.13,
      "step": 10580
    },
    {
      "epoch": 1.685527834149059,
      "grad_norm": 3.296875,
      "learning_rate": 3.1718273704476178e-06,
      "loss": 0.071,
      "step": 10590
    },
    {
      "epoch": 1.6871194938522143,
      "grad_norm": 4.9375,
      "learning_rate": 3.1557837317503614e-06,
      "loss": 0.1098,
      "step": 10600
    },
    {
      "epoch": 1.6887111535553698,
      "grad_norm": 4.375,
      "learning_rate": 3.1397400930531046e-06,
      "loss": 0.0982,
      "step": 10610
    },
    {
      "epoch": 1.6903028132585254,
      "grad_norm": 5.78125,
      "learning_rate": 3.1236964543558478e-06,
      "loss": 0.1026,
      "step": 10620
    },
    {
      "epoch": 1.691894472961681,
      "grad_norm": 4.34375,
      "learning_rate": 3.1076528156585918e-06,
      "loss": 0.0969,
      "step": 10630
    },
    {
      "epoch": 1.6934861326648363,
      "grad_norm": 3.921875,
      "learning_rate": 3.091609176961335e-06,
      "loss": 0.1032,
      "step": 10640
    },
    {
      "epoch": 1.6950777923679916,
      "grad_norm": 1.9609375,
      "learning_rate": 3.0755655382640786e-06,
      "loss": 0.1088,
      "step": 10650
    },
    {
      "epoch": 1.6965102861008317,
      "eval_loss": 0.1603735089302063,
      "eval_runtime": 17.0616,
      "eval_samples_per_second": 14.887,
      "eval_steps_per_second": 14.887,
      "step": 10659
    },
    {
      "epoch": 1.6966694520711472,
      "grad_norm": 2.109375,
      "learning_rate": 3.0595218995668218e-06,
      "loss": 0.0865,
      "step": 10660
    },
    {
      "epoch": 1.6982611117743027,
      "grad_norm": 5.125,
      "learning_rate": 3.043478260869566e-06,
      "loss": 0.1427,
      "step": 10670
    },
    {
      "epoch": 1.6998527714774583,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 3.027434622172309e-06,
|
|
"loss": 0.1022,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"epoch": 1.7014444311806136,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 3.0113909834750526e-06,
|
|
"loss": 0.0979,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"epoch": 1.703036090883769,
|
|
"grad_norm": 7.40625,
|
|
"learning_rate": 2.9953473447777958e-06,
|
|
"loss": 0.1038,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 1.7046277505869245,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 2.979303706080539e-06,
|
|
"loss": 0.0963,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"epoch": 1.70621941029008,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.963260067383283e-06,
|
|
"loss": 0.115,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"epoch": 1.7078110699932354,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 2.947216428686026e-06,
|
|
"loss": 0.1124,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"epoch": 1.709402729696391,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 2.93117278998877e-06,
|
|
"loss": 0.0944,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"epoch": 1.7109943893995463,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 2.915129151291513e-06,
|
|
"loss": 0.0936,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"epoch": 1.7125860491027018,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 2.899085512594257e-06,
|
|
"loss": 0.084,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"epoch": 1.7141777088058574,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 2.883041873897e-06,
|
|
"loss": 0.1052,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"epoch": 1.7157693685090127,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 2.8669982351997434e-06,
|
|
"loss": 0.1204,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"epoch": 1.717361028212168,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 2.850954596502487e-06,
|
|
"loss": 0.1258,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"epoch": 1.7189526879153236,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 2.83491095780523e-06,
|
|
"loss": 0.1063,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 1.7205443476184792,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 2.818867319107974e-06,
|
|
"loss": 0.1081,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"epoch": 1.7221360073216347,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 2.8028236804107174e-06,
|
|
"loss": 0.1006,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"epoch": 1.72372766702479,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 2.786780041713461e-06,
|
|
"loss": 0.1142,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"epoch": 1.7253193267279454,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 2.770736403016204e-06,
|
|
"loss": 0.1334,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"epoch": 1.726910986431101,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 2.7546927643189474e-06,
|
|
"loss": 0.087,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"epoch": 1.7285026461342565,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 2.7386491256216914e-06,
|
|
"loss": 0.1282,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"epoch": 1.7297759738967808,
|
|
"eval_loss": 0.16021965444087982,
|
|
"eval_runtime": 2916.6803,
|
|
"eval_samples_per_second": 0.087,
|
|
"eval_steps_per_second": 0.087,
|
|
"step": 10868
|
|
},
|
|
{
|
|
"epoch": 1.730094305837412,
|
|
"grad_norm": 4.625,
|
|
"learning_rate": 2.7226054869244346e-06,
|
|
"loss": 0.1148,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"epoch": 1.7316859655405674,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 2.706561848227178e-06,
|
|
"loss": 0.1392,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"epoch": 1.7332776252437228,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 2.6905182095299214e-06,
|
|
"loss": 0.1241,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"epoch": 1.7348692849468783,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 2.6744745708326654e-06,
|
|
"loss": 0.0982,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 1.7364609446500339,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 2.6584309321354086e-06,
|
|
"loss": 0.0907,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"epoch": 1.7380526043531894,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 2.642387293438152e-06,
|
|
"loss": 0.0722,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"epoch": 1.7396442640563448,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 2.6263436547408954e-06,
|
|
"loss": 0.0933,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"epoch": 1.7412359237595,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 2.6103000160436386e-06,
|
|
"loss": 0.1008,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"epoch": 1.7428275834626556,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 2.5942563773463826e-06,
|
|
"loss": 0.099,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"epoch": 1.7444192431658112,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 2.578212738649126e-06,
|
|
"loss": 0.1023,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"epoch": 1.7460109028689668,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 2.5621690999518694e-06,
|
|
"loss": 0.0957,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"epoch": 1.747602562572122,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 2.5461254612546126e-06,
|
|
"loss": 0.0931,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"epoch": 1.7491942222752774,
|
|
"grad_norm": 7.75,
|
|
"learning_rate": 2.5300818225573566e-06,
|
|
"loss": 0.1511,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"epoch": 1.750785881978433,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 2.5140381838601e-06,
|
|
"loss": 0.0968,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 1.7523775416815885,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 2.497994545162843e-06,
|
|
"loss": 0.1611,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"epoch": 1.753969201384744,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 2.4819509064655866e-06,
|
|
"loss": 0.101,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"epoch": 1.7555608610878994,
|
|
"grad_norm": 4.78125,
|
|
"learning_rate": 2.4659072677683302e-06,
|
|
"loss": 0.1209,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"epoch": 1.7571525207910548,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 2.4498636290710734e-06,
|
|
"loss": 0.132,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"epoch": 1.7587441804942103,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 2.433819990373817e-06,
|
|
"loss": 0.1051,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"epoch": 1.7603358401973659,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 2.41777635167656e-06,
|
|
"loss": 0.1002,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"epoch": 1.7619274999005212,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 2.401732712979304e-06,
|
|
"loss": 0.0979,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"epoch": 1.7630416616927302,
|
|
"eval_loss": 0.16065296530723572,
|
|
"eval_runtime": 17.1836,
|
|
"eval_samples_per_second": 14.782,
|
|
"eval_steps_per_second": 14.782,
|
|
"step": 11077
|
|
},
|
|
{
|
|
"epoch": 1.7635191596036768,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 2.3856890742820474e-06,
|
|
"loss": 0.0786,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"epoch": 1.7651108193068321,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 2.369645435584791e-06,
|
|
"loss": 0.0879,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"epoch": 1.7667024790099877,
|
|
"grad_norm": 4.34375,
|
|
"learning_rate": 2.353601796887534e-06,
|
|
"loss": 0.1297,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 1.7682941387131432,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 2.337558158190278e-06,
|
|
"loss": 0.1258,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"epoch": 1.7698857984162986,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 2.321514519493021e-06,
|
|
"loss": 0.1049,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"epoch": 1.771477458119454,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 2.3054708807957646e-06,
|
|
"loss": 0.1075,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"epoch": 1.7730691178226095,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 2.2894272420985082e-06,
|
|
"loss": 0.0778,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"epoch": 1.774660777525765,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 2.2733836034012514e-06,
|
|
"loss": 0.0874,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"epoch": 1.7762524372289206,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 2.257339964703995e-06,
|
|
"loss": 0.0857,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"epoch": 1.777844096932076,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 2.2412963260067386e-06,
|
|
"loss": 0.1813,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"epoch": 1.7794357566352312,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 2.2252526873094822e-06,
|
|
"loss": 0.1298,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"epoch": 1.7810274163383868,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 2.2092090486122254e-06,
|
|
"loss": 0.1115,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"epoch": 1.7826190760415423,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 2.1931654099149686e-06,
|
|
"loss": 0.0804,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 1.784210735744698,
|
|
"grad_norm": 3.28125,
|
|
"learning_rate": 2.177121771217712e-06,
|
|
"loss": 0.1247,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"epoch": 1.7858023954478532,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 2.161078132520456e-06,
|
|
"loss": 0.1403,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"epoch": 1.7873940551510086,
|
|
"grad_norm": 4.34375,
|
|
"learning_rate": 2.1450344938231994e-06,
|
|
"loss": 0.1212,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"epoch": 1.7889857148541641,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 2.1289908551259426e-06,
|
|
"loss": 0.1052,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"epoch": 1.7905773745573197,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 2.1129472164286862e-06,
|
|
"loss": 0.1192,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"epoch": 1.7921690342604752,
|
|
"grad_norm": 4.96875,
|
|
"learning_rate": 2.09690357773143e-06,
|
|
"loss": 0.1667,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"epoch": 1.7937606939636306,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 2.080859939034173e-06,
|
|
"loss": 0.1342,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"epoch": 1.795352353666786,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 2.0648163003369166e-06,
|
|
"loss": 0.1312,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"epoch": 1.7963073494886794,
|
|
"eval_loss": 0.15989363193511963,
|
|
"eval_runtime": 17.0918,
|
|
"eval_samples_per_second": 14.861,
|
|
"eval_steps_per_second": 14.861,
|
|
"step": 11286
|
|
},
|
|
{
|
|
"epoch": 1.7969440133699415,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 2.04877266163966e-06,
|
|
"loss": 0.1256,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"epoch": 1.798535673073097,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 2.0327290229424034e-06,
|
|
"loss": 0.1239,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 1.8001273327762526,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 2.016685384245147e-06,
|
|
"loss": 0.0959,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"epoch": 1.801718992479408,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 2.0006417455478906e-06,
|
|
"loss": 0.0832,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"epoch": 1.8033106521825633,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 1.984598106850634e-06,
|
|
"loss": 0.1063,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"epoch": 1.8049023118857188,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 1.9685544681533774e-06,
|
|
"loss": 0.1093,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"epoch": 1.8064939715888744,
|
|
"grad_norm": 6.9375,
|
|
"learning_rate": 1.9525108294561206e-06,
|
|
"loss": 0.1244,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"epoch": 1.80808563129203,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 1.9364671907588642e-06,
|
|
"loss": 0.1185,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"epoch": 1.8096772909951853,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 1.920423552061608e-06,
|
|
"loss": 0.1485,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"epoch": 1.8112689506983406,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 1.9043799133643512e-06,
|
|
"loss": 0.1105,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"epoch": 1.8128606104014962,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 1.8883362746670946e-06,
|
|
"loss": 0.0758,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"epoch": 1.8144522701046517,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 1.8722926359698382e-06,
|
|
"loss": 0.1368,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 1.816043929807807,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.8562489972725816e-06,
|
|
"loss": 0.13,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"epoch": 1.8176355895109626,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 1.8402053585753248e-06,
|
|
"loss": 0.1068,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"epoch": 1.819227249214118,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 1.8241617198780684e-06,
|
|
"loss": 0.0784,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"epoch": 1.8208189089172735,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 1.8081180811808118e-06,
|
|
"loss": 0.1161,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"epoch": 1.822410568620429,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.7920744424835554e-06,
|
|
"loss": 0.1267,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"epoch": 1.8240022283235844,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.7760308037862988e-06,
|
|
"loss": 0.1004,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"epoch": 1.8255938880267397,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 1.7599871650890424e-06,
|
|
"loss": 0.0858,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"epoch": 1.8271855477298953,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.7439435263917858e-06,
|
|
"loss": 0.1168,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"epoch": 1.8287772074330508,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 1.7278998876945295e-06,
|
|
"loss": 0.0967,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"epoch": 1.8295730372846286,
|
|
"eval_loss": 0.15992198884487152,
|
|
"eval_runtime": 2931.1594,
|
|
"eval_samples_per_second": 0.087,
|
|
"eval_steps_per_second": 0.087,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"epoch": 1.8303688671362064,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.7118562489972726e-06,
|
|
"loss": 0.0995,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 1.8319605268393617,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 1.695812610300016e-06,
|
|
"loss": 0.1326,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"epoch": 1.833552186542517,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.6797689716027596e-06,
|
|
"loss": 0.1055,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"epoch": 1.8351438462456726,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 1.663725332905503e-06,
|
|
"loss": 0.0796,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"epoch": 1.8367355059488282,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 1.6476816942082466e-06,
|
|
"loss": 0.1633,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"epoch": 1.8383271656519837,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 1.63163805551099e-06,
|
|
"loss": 0.1178,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"epoch": 1.839918825355139,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.6155944168137337e-06,
|
|
"loss": 0.0816,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"epoch": 1.8415104850582944,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 1.599550778116477e-06,
|
|
"loss": 0.1055,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"epoch": 1.84310214476145,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 1.5835071394192202e-06,
|
|
"loss": 0.1197,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"epoch": 1.8446938044646055,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 1.5674635007219638e-06,
|
|
"loss": 0.1418,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"epoch": 1.846285464167761,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 1.5514198620247072e-06,
|
|
"loss": 0.0938,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 1.8478771238709164,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 1.5353762233274509e-06,
|
|
"loss": 0.0839,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"epoch": 1.8494687835740717,
|
|
"grad_norm": 5.09375,
|
|
"learning_rate": 1.5193325846301942e-06,
|
|
"loss": 0.1272,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"epoch": 1.8510604432772273,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 1.5032889459329379e-06,
|
|
"loss": 0.099,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"epoch": 1.8526521029803829,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.4872453072356813e-06,
|
|
"loss": 0.1114,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"epoch": 1.8542437626835384,
|
|
"grad_norm": 3.140625,
|
|
"learning_rate": 1.4712016685384244e-06,
|
|
"loss": 0.0827,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"epoch": 1.8558354223866937,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 1.455158029841168e-06,
|
|
"loss": 0.0996,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"epoch": 1.857427082089849,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 1.4391143911439114e-06,
|
|
"loss": 0.0852,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"epoch": 1.8590187417930046,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 1.423070752446655e-06,
|
|
"loss": 0.1203,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"epoch": 1.8606104014961602,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 1.4070271137493985e-06,
|
|
"loss": 0.1096,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"epoch": 1.8622020611993158,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 1.390983475052142e-06,
|
|
"loss": 0.0884,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 1.8628387250805778,
|
|
"eval_loss": 0.15974481403827667,
|
|
"eval_runtime": 17.5598,
|
|
"eval_samples_per_second": 14.465,
|
|
"eval_steps_per_second": 14.465,
|
|
"step": 11704
|
|
},
|
|
{
|
|
"epoch": 1.863793720902471,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 1.3749398363548855e-06,
|
|
"loss": 0.0884,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"epoch": 1.8653853806056264,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 1.358896197657629e-06,
|
|
"loss": 0.0721,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"epoch": 1.866977040308782,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.3428525589603723e-06,
|
|
"loss": 0.1462,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"epoch": 1.8685687000119375,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 1.3268089202631157e-06,
|
|
"loss": 0.0927,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"epoch": 1.8701603597150929,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 1.3107652815658593e-06,
|
|
"loss": 0.0796,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"epoch": 1.8717520194182484,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 1.2947216428686027e-06,
|
|
"loss": 0.1075,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"epoch": 1.8733436791214038,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 1.2786780041713463e-06,
|
|
"loss": 0.1183,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"epoch": 1.8749353388245593,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 1.2626343654740897e-06,
|
|
"loss": 0.1032,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"epoch": 1.8765269985277149,
|
|
"grad_norm": 3.0625,
|
|
"learning_rate": 1.246590726776833e-06,
|
|
"loss": 0.1153,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"epoch": 1.8781186582308702,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 1.2305470880795765e-06,
|
|
"loss": 0.148,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 1.8797103179340255,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.21450344938232e-06,
|
|
"loss": 0.1021,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"epoch": 1.881301977637181,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 1.1984598106850635e-06,
|
|
"loss": 0.115,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"epoch": 1.8828936373403367,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 1.1824161719878069e-06,
|
|
"loss": 0.1435,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"epoch": 1.8844852970434922,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 1.1663725332905505e-06,
|
|
"loss": 0.1027,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"epoch": 1.8860769567466475,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 1.1503288945932939e-06,
|
|
"loss": 0.1322,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"epoch": 1.8876686164498029,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 1.1342852558960373e-06,
|
|
"loss": 0.0919,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"epoch": 1.8892602761529584,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 1.1182416171987807e-06,
|
|
"loss": 0.0985,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"epoch": 1.890851935856114,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 1.1021979785015243e-06,
|
|
"loss": 0.1313,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"epoch": 1.8924435955592696,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 1.0861543398042677e-06,
|
|
"loss": 0.0744,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"epoch": 1.8940352552624249,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 1.070110701107011e-06,
|
|
"loss": 0.1065,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 1.8956269149655802,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.0540670624097547e-06,
|
|
"loss": 0.1501,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"epoch": 1.896104412876527,
|
|
"eval_loss": 0.15988750755786896,
|
|
"eval_runtime": 17.3258,
|
|
"eval_samples_per_second": 14.66,
|
|
"eval_steps_per_second": 14.66,
|
|
"step": 11913
|
|
},
|
|
{
|
|
"epoch": 1.8972185746687358,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 1.038023423712498e-06,
|
|
"loss": 0.136,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"epoch": 1.8988102343718913,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 1.0219797850152417e-06,
|
|
"loss": 0.098,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"epoch": 1.900401894075047,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.0059361463179849e-06,
|
|
"loss": 0.0975,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"epoch": 1.9019935537782022,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 9.898925076207285e-07,
|
|
"loss": 0.0933,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"epoch": 1.9035852134813576,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 9.738488689234719e-07,
|
|
"loss": 0.1223,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"epoch": 1.9051768731845131,
|
|
"grad_norm": 3.140625,
|
|
"learning_rate": 9.578052302262155e-07,
|
|
"loss": 0.1138,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"epoch": 1.9067685328876687,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 9.417615915289588e-07,
|
|
"loss": 0.1318,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"epoch": 1.9083601925908242,
|
|
"grad_norm": 5.1875,
|
|
"learning_rate": 9.257179528317023e-07,
|
|
"loss": 0.1122,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"epoch": 1.9099518522939796,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 9.096743141344458e-07,
|
|
"loss": 0.1108,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 1.911543511997135,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 8.936306754371893e-07,
|
|
"loss": 0.0837,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"epoch": 1.9131351717002905,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 8.775870367399327e-07,
|
|
"loss": 0.0896,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"epoch": 1.914726831403446,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 8.615433980426762e-07,
|
|
"loss": 0.1262,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"epoch": 1.9163184911066016,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 8.454997593454197e-07,
|
|
"loss": 0.073,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"epoch": 1.917910150809757,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 8.294561206481632e-07,
|
|
"loss": 0.1236,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"epoch": 1.9195018105129122,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 8.134124819509065e-07,
|
|
"loss": 0.1156,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"epoch": 1.9210934702160678,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 7.9736884325365e-07,
|
|
"loss": 0.1633,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"epoch": 1.9226851299192234,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 7.813252045563935e-07,
|
|
"loss": 0.0892,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"epoch": 1.9242767896223787,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 7.652815658591369e-07,
|
|
"loss": 0.094,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"epoch": 1.9258684493255342,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 7.492379271618804e-07,
|
|
"loss": 0.1152,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"epoch": 1.9274601090286896,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 7.331942884646239e-07,
|
|
"loss": 0.0924,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"epoch": 1.9290517687318451,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 7.171506497673674e-07,
|
|
"loss": 0.0923,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"epoch": 1.9293701006724762,
|
|
"eval_loss": 0.1598162055015564,
|
|
"eval_runtime": 2903.1461,
|
|
"eval_samples_per_second": 0.087,
|
|
"eval_steps_per_second": 0.087,
|
|
"step": 12122
|
|
},
|
|
{
|
|
"epoch": 1.9306434284350007,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 7.011070110701107e-07,
|
|
"loss": 0.0925,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"epoch": 1.932235088138156,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 6.850633723728542e-07,
|
|
"loss": 0.1368,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"epoch": 1.9338267478413114,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 6.690197336755977e-07,
|
|
"loss": 0.1244,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"epoch": 1.935418407544467,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 6.529760949783412e-07,
|
|
"loss": 0.1111,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"epoch": 1.9370100672476225,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 6.369324562810846e-07,
|
|
"loss": 0.119,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"epoch": 1.938601726950778,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 6.208888175838281e-07,
|
|
"loss": 0.1338,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"epoch": 1.9401933866539334,
|
|
"grad_norm": 5.375,
|
|
"learning_rate": 6.048451788865716e-07,
|
|
"loss": 0.1424,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"epoch": 1.9417850463570887,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 5.88801540189315e-07,
|
|
"loss": 0.1186,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"epoch": 1.9433767060602443,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 5.727579014920585e-07,
|
|
"loss": 0.1162,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"epoch": 1.9449683657633998,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 5.567142627948019e-07,
|
|
"loss": 0.1023,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"epoch": 1.9465600254665554,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 5.406706240975454e-07,
|
|
"loss": 0.1463,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"epoch": 1.9481516851697107,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 5.246269854002888e-07,
|
|
"loss": 0.0919,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"epoch": 1.949743344872866,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 5.085833467030323e-07,
|
|
"loss": 0.1376,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"epoch": 1.9513350045760216,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 4.925397080057757e-07,
|
|
"loss": 0.0862,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"epoch": 1.9529266642791772,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 4.7649606930851925e-07,
|
|
"loss": 0.0795,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"epoch": 1.9545183239823327,
|
|
"grad_norm": 6.84375,
|
|
"learning_rate": 4.6045243061126265e-07,
|
|
"loss": 0.0978,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"epoch": 1.956109983685488,
|
|
"grad_norm": 4.53125,
|
|
"learning_rate": 4.444087919140061e-07,
|
|
"loss": 0.098,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"epoch": 1.9577016433886434,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 4.283651532167496e-07,
|
|
"loss": 0.0876,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"epoch": 1.959293303091799,
|
|
"grad_norm": 5.3125,
|
|
"learning_rate": 4.12321514519493e-07,
|
|
"loss": 0.1219,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"epoch": 1.9608849627949545,
|
|
"grad_norm": 2.984375,
|
|
"learning_rate": 3.962778758222365e-07,
|
|
"loss": 0.1514,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"epoch": 1.96247662249811,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 3.8023423712497995e-07,
|
|
"loss": 0.0855,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"epoch": 1.9626357884684253,
|
|
"eval_loss": 0.15987609326839447,
|
|
"eval_runtime": 17.4187,
|
|
"eval_samples_per_second": 14.582,
|
|
"eval_steps_per_second": 14.582,
|
|
"step": 12331
|
|
},
|
|
{
|
|
"epoch": 1.9640682822012654,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 3.6419059842772345e-07,
|
|
"loss": 0.1002,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"epoch": 1.9656599419044207,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 3.4814695973046685e-07,
|
|
"loss": 0.084,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"epoch": 1.9672516016075763,
|
|
"grad_norm": 6.71875,
|
|
"learning_rate": 3.3210332103321035e-07,
|
|
"loss": 0.1039,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"epoch": 1.9688432613107318,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 3.160596823359538e-07,
|
|
"loss": 0.1531,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"epoch": 1.9704349210138874,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 3.000160436386973e-07,
|
|
"loss": 0.0835,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"epoch": 1.9720265807170427,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 2.8397240494144076e-07,
|
|
"loss": 0.1096,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"epoch": 1.973618240420198,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 2.679287662441842e-07,
|
|
"loss": 0.1108,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"epoch": 1.9752099001233536,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 2.5188512754692766e-07,
|
|
"loss": 0.1088,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"epoch": 1.9768015598265092,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 2.358414888496711e-07,
|
|
"loss": 0.0737,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"epoch": 1.9783932195296645,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 2.1979785015241458e-07,
|
|
"loss": 0.1121,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"epoch": 1.97998487923282,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 2.0375421145515803e-07,
|
|
"loss": 0.0821,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"epoch": 1.9815765389359754,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 1.877105727579015e-07,
|
|
"loss": 0.0885,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"epoch": 1.983168198639131,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 1.7166693406064496e-07,
|
|
"loss": 0.1245,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"epoch": 1.9847598583422865,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 1.556232953633884e-07,
|
|
"loss": 0.0911,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"epoch": 1.9863515180454419,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 1.395796566661319e-07,
|
|
"loss": 0.1045,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"epoch": 1.9879431777485972,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 1.2353601796887534e-07,
|
|
"loss": 0.1185,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"epoch": 1.9895348374517527,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 1.0749237927161882e-07,
|
|
"loss": 0.1197,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 1.9911264971549083,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 9.144874057436228e-08,
|
|
"loss": 0.0983,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"epoch": 1.9927181568580639,
|
|
"grad_norm": 6.375,
|
|
"learning_rate": 7.540510187710574e-08,
|
|
"loss": 0.0835,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"epoch": 1.9943098165612192,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 5.936146317984919e-08,
|
|
"loss": 0.1013,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"epoch": 1.9959014762643745,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 4.3317824482592655e-08,
|
|
"loss": 0.1428,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"epoch": 1.9959014762643745,
|
|
"eval_loss": 0.15973736345767975,
|
|
"eval_runtime": 17.5019,
|
|
"eval_samples_per_second": 14.513,
|
|
"eval_steps_per_second": 14.513,
|
|
"step": 12540
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 12566,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 209,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.200062195531776e+16,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|