Model: huseyinatahaninan/appworld_distillation_sft_v2-SFT-Qwen3-4B-Instruct-2507 Source: Original Platform
704 lines
16 KiB
JSON
704 lines
16 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 30.0,
|
|
"eval_steps": 500,
|
|
"global_step": 60,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 38.81770706176758,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.669,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 42.17606735229492,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": 1.7313,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 1.7900009155273438,
|
|
"eval_runtime": 7.5875,
|
|
"eval_samples_per_second": 4.613,
|
|
"eval_steps_per_second": 0.659,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 40.34153366088867,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 1.662,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 37.466861724853516,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 1.6378,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 1.5364655256271362,
|
|
"eval_runtime": 7.4642,
|
|
"eval_samples_per_second": 4.689,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 29.840675354003906,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 1.4225,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 13.304512977600098,
|
|
"learning_rate": 4.166666666666667e-06,
|
|
"loss": 1.1356,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"eval_loss": 1.1185977458953857,
|
|
"eval_runtime": 7.4596,
|
|
"eval_samples_per_second": 4.692,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 3.5,
|
|
"grad_norm": 10.08704662322998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 1.0871,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 3.6056466102600098,
|
|
"learning_rate": 4.995770395678171e-06,
|
|
"loss": 0.9041,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"eval_loss": 0.9372425079345703,
|
|
"eval_runtime": 7.4702,
|
|
"eval_samples_per_second": 4.685,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 4.5,
|
|
"grad_norm": 3.1515934467315674,
|
|
"learning_rate": 4.983095894354858e-06,
|
|
"loss": 0.8773,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 2.810807704925537,
|
|
"learning_rate": 4.962019382530521e-06,
|
|
"loss": 0.8762,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"eval_loss": 0.8529078364372253,
|
|
"eval_runtime": 7.4567,
|
|
"eval_samples_per_second": 4.694,
|
|
"eval_steps_per_second": 0.671,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 5.5,
|
|
"grad_norm": 2.751431465148926,
|
|
"learning_rate": 4.93261217644956e-06,
|
|
"loss": 0.8024,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"grad_norm": 3.107816219329834,
|
|
"learning_rate": 4.894973780788722e-06,
|
|
"loss": 0.7807,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"eval_loss": 0.8198402523994446,
|
|
"eval_runtime": 7.4664,
|
|
"eval_samples_per_second": 4.688,
|
|
"eval_steps_per_second": 0.67,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 6.5,
|
|
"grad_norm": 2.681008815765381,
|
|
"learning_rate": 4.849231551964771e-06,
|
|
"loss": 0.758,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"grad_norm": 2.2834079265594482,
|
|
"learning_rate": 4.7955402672006855e-06,
|
|
"loss": 0.7323,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"eval_loss": 0.7645982503890991,
|
|
"eval_runtime": 7.482,
|
|
"eval_samples_per_second": 4.678,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 7.5,
|
|
"grad_norm": 1.8697445392608643,
|
|
"learning_rate": 4.734081600808531e-06,
|
|
"loss": 0.7175,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"grad_norm": 1.4500072002410889,
|
|
"learning_rate": 4.665063509461098e-06,
|
|
"loss": 0.6814,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"eval_loss": 0.7229499816894531,
|
|
"eval_runtime": 7.4866,
|
|
"eval_samples_per_second": 4.675,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 8.5,
|
|
"grad_norm": 1.2065025568008423,
|
|
"learning_rate": 4.588719528532342e-06,
|
|
"loss": 0.6536,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"grad_norm": 1.2752524614334106,
|
|
"learning_rate": 4.50530798188761e-06,
|
|
"loss": 0.6211,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"eval_loss": 0.6847367882728577,
|
|
"eval_runtime": 7.4811,
|
|
"eval_samples_per_second": 4.678,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 9.5,
|
|
"grad_norm": 1.4255502223968506,
|
|
"learning_rate": 4.415111107797445e-06,
|
|
"loss": 0.5956,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"grad_norm": 1.2353219985961914,
|
|
"learning_rate": 4.318434103932622e-06,
|
|
"loss": 0.5738,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"eval_loss": 0.6638691425323486,
|
|
"eval_runtime": 7.4794,
|
|
"eval_samples_per_second": 4.68,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 10.5,
|
|
"grad_norm": 1.1523832082748413,
|
|
"learning_rate": 4.215604094671835e-06,
|
|
"loss": 0.5499,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 11.0,
|
|
"grad_norm": 1.1074179410934448,
|
|
"learning_rate": 4.106969024216348e-06,
|
|
"loss": 0.5171,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 11.0,
|
|
"eval_loss": 0.6498724222183228,
|
|
"eval_runtime": 7.4888,
|
|
"eval_samples_per_second": 4.674,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 11.5,
|
|
"grad_norm": 1.0679000616073608,
|
|
"learning_rate": 3.992896479256966e-06,
|
|
"loss": 0.5088,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"grad_norm": 1.0380491018295288,
|
|
"learning_rate": 3.8737724451770155e-06,
|
|
"loss": 0.4868,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"eval_loss": 0.6385172605514526,
|
|
"eval_runtime": 7.495,
|
|
"eval_samples_per_second": 4.67,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 12.5,
|
|
"grad_norm": 0.8720147609710693,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 0.4697,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 13.0,
|
|
"grad_norm": 0.9460955858230591,
|
|
"learning_rate": 3.621997950501156e-06,
|
|
"loss": 0.4371,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 13.0,
|
|
"eval_loss": 0.6327239871025085,
|
|
"eval_runtime": 7.4987,
|
|
"eval_samples_per_second": 4.667,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 13.5,
|
|
"grad_norm": 0.9779807925224304,
|
|
"learning_rate": 3.4901994150978926e-06,
|
|
"loss": 0.4361,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 14.0,
|
|
"grad_norm": 0.9678999185562134,
|
|
"learning_rate": 3.3550503583141726e-06,
|
|
"loss": 0.407,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 14.0,
|
|
"eval_loss": 0.6305895447731018,
|
|
"eval_runtime": 7.492,
|
|
"eval_samples_per_second": 4.672,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 14.5,
|
|
"grad_norm": 0.9876528978347778,
|
|
"learning_rate": 3.217008081777726e-06,
|
|
"loss": 0.4015,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"grad_norm": 0.9497820734977722,
|
|
"learning_rate": 3.0765396768561005e-06,
|
|
"loss": 0.3924,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"eval_loss": 0.6329755187034607,
|
|
"eval_runtime": 7.4793,
|
|
"eval_samples_per_second": 4.68,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 15.5,
|
|
"grad_norm": 0.9186223745346069,
|
|
"learning_rate": 2.9341204441673267e-06,
|
|
"loss": 0.378,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"grad_norm": 0.9468213319778442,
|
|
"learning_rate": 2.7902322853130758e-06,
|
|
"loss": 0.3505,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"eval_loss": 0.6392822861671448,
|
|
"eval_runtime": 7.4979,
|
|
"eval_samples_per_second": 4.668,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 16.5,
|
|
"grad_norm": 0.9568607807159424,
|
|
"learning_rate": 2.6453620722761897e-06,
|
|
"loss": 0.3475,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 17.0,
|
|
"grad_norm": 1.3522801399230957,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 0.3339,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 17.0,
|
|
"eval_loss": 0.6492648124694824,
|
|
"eval_runtime": 7.479,
|
|
"eval_samples_per_second": 4.68,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 17.5,
|
|
"grad_norm": 0.9427582025527954,
|
|
"learning_rate": 2.3546379277238107e-06,
|
|
"loss": 0.3202,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 18.0,
|
|
"grad_norm": 1.018237829208374,
|
|
"learning_rate": 2.2097677146869242e-06,
|
|
"loss": 0.3086,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 18.0,
|
|
"eval_loss": 0.6622989773750305,
|
|
"eval_runtime": 7.4939,
|
|
"eval_samples_per_second": 4.67,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 18.5,
|
|
"grad_norm": 0.9453594088554382,
|
|
"learning_rate": 2.0658795558326745e-06,
|
|
"loss": 0.3004,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 19.0,
|
|
"grad_norm": 1.172818899154663,
|
|
"learning_rate": 1.9234603231439e-06,
|
|
"loss": 0.2803,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 19.0,
|
|
"eval_loss": 0.6748006343841553,
|
|
"eval_runtime": 7.4995,
|
|
"eval_samples_per_second": 4.667,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 19.5,
|
|
"grad_norm": 1.2339236736297607,
|
|
"learning_rate": 1.7829919182222752e-06,
|
|
"loss": 0.2751,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"grad_norm": 1.0225498676300049,
|
|
"learning_rate": 1.6449496416858285e-06,
|
|
"loss": 0.2687,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"eval_loss": 0.6873091459274292,
|
|
"eval_runtime": 7.4881,
|
|
"eval_samples_per_second": 4.674,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 20.5,
|
|
"grad_norm": 0.9979678392410278,
|
|
"learning_rate": 1.509800584902108e-06,
|
|
"loss": 0.2556,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 21.0,
|
|
"grad_norm": 0.9665634632110596,
|
|
"learning_rate": 1.3780020494988447e-06,
|
|
"loss": 0.25,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 21.0,
|
|
"eval_loss": 0.6983169913291931,
|
|
"eval_runtime": 7.4891,
|
|
"eval_samples_per_second": 4.673,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 21.5,
|
|
"grad_norm": 1.0296282768249512,
|
|
"learning_rate": 1.2500000000000007e-06,
|
|
"loss": 0.238,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 22.0,
|
|
"grad_norm": 1.0459901094436646,
|
|
"learning_rate": 1.1262275548229852e-06,
|
|
"loss": 0.2306,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 22.0,
|
|
"eval_loss": 0.7099719047546387,
|
|
"eval_runtime": 7.5003,
|
|
"eval_samples_per_second": 4.667,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 22.5,
|
|
"grad_norm": 1.0921026468276978,
|
|
"learning_rate": 1.0071035207430352e-06,
|
|
"loss": 0.227,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 23.0,
|
|
"grad_norm": 0.9770745635032654,
|
|
"learning_rate": 8.930309757836517e-07,
|
|
"loss": 0.2168,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 23.0,
|
|
"eval_loss": 0.720533549785614,
|
|
"eval_runtime": 7.4879,
|
|
"eval_samples_per_second": 4.674,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 23.5,
|
|
"grad_norm": 1.0788757801055908,
|
|
"learning_rate": 7.843959053281663e-07,
|
|
"loss": 0.2105,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"grad_norm": 1.103110432624817,
|
|
"learning_rate": 6.815658960673782e-07,
|
|
"loss": 0.2125,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"eval_loss": 0.7300633192062378,
|
|
"eval_runtime": 7.479,
|
|
"eval_samples_per_second": 4.68,
|
|
"eval_steps_per_second": 0.669,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 24.5,
|
|
"grad_norm": 0.9235285520553589,
|
|
"learning_rate": 5.848888922025553e-07,
|
|
"loss": 0.2052,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"grad_norm": 1.4067970514297485,
|
|
"learning_rate": 4.946920181123904e-07,
|
|
"loss": 0.2031,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"eval_loss": 0.7378360629081726,
|
|
"eval_runtime": 7.5148,
|
|
"eval_samples_per_second": 4.657,
|
|
"eval_steps_per_second": 0.665,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 25.5,
|
|
"grad_norm": 0.960370659828186,
|
|
"learning_rate": 4.1128047146765936e-07,
|
|
"loss": 0.2048,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 26.0,
|
|
"grad_norm": 1.2136261463165283,
|
|
"learning_rate": 3.3493649053890325e-07,
|
|
"loss": 0.1975,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 26.0,
|
|
"eval_loss": 0.7432867288589478,
|
|
"eval_runtime": 7.4969,
|
|
"eval_samples_per_second": 4.669,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 26.5,
|
|
"grad_norm": 1.0613588094711304,
|
|
"learning_rate": 2.6591839919146963e-07,
|
|
"loss": 0.1925,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 27.0,
|
|
"grad_norm": 1.117826223373413,
|
|
"learning_rate": 2.044597327993153e-07,
|
|
"loss": 0.2001,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 27.0,
|
|
"eval_loss": 0.7474139928817749,
|
|
"eval_runtime": 7.4824,
|
|
"eval_samples_per_second": 4.678,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 27.5,
|
|
"grad_norm": 1.350067138671875,
|
|
"learning_rate": 1.507684480352292e-07,
|
|
"loss": 0.1942,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 28.0,
|
|
"grad_norm": 0.8971886038780212,
|
|
"learning_rate": 1.0502621921127776e-07,
|
|
"loss": 0.1953,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 28.0,
|
|
"eval_loss": 0.7486553192138672,
|
|
"eval_runtime": 7.4958,
|
|
"eval_samples_per_second": 4.669,
|
|
"eval_steps_per_second": 0.667,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 28.5,
|
|
"grad_norm": 0.9421606063842773,
|
|
"learning_rate": 6.738782355044048e-08,
|
|
"loss": 0.1883,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 29.0,
|
|
"grad_norm": 1.1213371753692627,
|
|
"learning_rate": 3.798061746947995e-08,
|
|
"loss": 0.1895,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 29.0,
|
|
"eval_loss": 0.7486764788627625,
|
|
"eval_runtime": 7.4875,
|
|
"eval_samples_per_second": 4.674,
|
|
"eval_steps_per_second": 0.668,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 29.5,
|
|
"grad_norm": 1.0604745149612427,
|
|
"learning_rate": 1.6904105645142443e-08,
|
|
"loss": 0.1886,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"grad_norm": 1.0217480659484863,
|
|
"learning_rate": 4.229604321829561e-09,
|
|
"loss": 0.1976,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"eval_loss": 0.7485920190811157,
|
|
"eval_runtime": 7.2053,
|
|
"eval_samples_per_second": 4.858,
|
|
"eval_steps_per_second": 0.694,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 30.0,
|
|
"step": 60,
|
|
"total_flos": 1.0166485230865613e+18,
|
|
"train_loss": 0.5314400238295396,
|
|
"train_runtime": 1496.8474,
|
|
"train_samples_per_second": 1.022,
|
|
"train_steps_per_second": 0.04
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 60,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 30,
|
|
"save_steps": 2000000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.0166485230865613e+18,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|