{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0014534883720930232, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10086.0, "completions/max_terminated_length": 10086.0, "completions/mean_length": 4296.546875, "completions/mean_terminated_length": 4296.546875, "completions/min_length": 1720.0, "completions/min_terminated_length": 1720.0, "epoch": 2.4224806201550387e-05, "grad_norm": 0.016954593260394005, "kl": 0.0009393692016601562, "learning_rate": 0.0, "loss": -0.0003, "num_tokens": 601834.0, "reward": 0.4602593183517456, "reward_std": 0.24803586304187775, "rewards/avg_thinking_length_func": 185.02471923828125, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.8889554441999474, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.7176268100738525, "rewards/tool_execution_reward_func": 1.9936248064041138, "rewards/visit_tool_reward_func": 0.9308543801307678, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.8449612403100775e-05, "grad_norm": 0.016953615886545852, "kl": 0.0009393692016601562, "learning_rate": 6.25e-08, "loss": -0.0003, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.267441860465116e-05, "grad_norm": 0.016864690676516626, "kl": 0.0009565353393554688, "learning_rate": 1.25e-07, "loss": -0.0003, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.689922480620155e-05, "grad_norm": 0.016822420848305722, "kl": 0.0009622573852539062, "learning_rate": 1.875e-07, "loss": -0.0003, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9385.0, "completions/max_terminated_length": 9385.0, "completions/mean_length": 4270.703125, "completions/mean_terminated_length": 4270.703125, "completions/min_length": 1390.0, "completions/min_terminated_length": 1390.0, "epoch": 0.00012112403100775194, "grad_norm": 0.025862550499858347, "kl": 0.000957489013671875, "learning_rate": 2.5e-07, "loss": 0.0031, "num_tokens": 1199795.0, "reward": 0.566771388053894, "reward_std": 0.48137491941452026, "rewards/avg_thinking_length_func": 182.33303833007812, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.8707049785861538, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.7195165157318115, "rewards/tool_execution_reward_func": 1.9965277910232544, "rewards/visit_tool_reward_func": 0.9274243116378784, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00014534883720930232, "grad_norm": 0.025877236026611388, "kl": 0.0009489059448242188, "learning_rate": 3.1249999999999997e-07, "loss": 0.0031, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001695736434108527, "grad_norm": 0.025817236127475232, "kl": 0.0009660720825195312, "learning_rate": 3.75e-07, "loss": 0.0031, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001937984496124031, "grad_norm": 0.02584169829863559, "kl": 0.0009441375732421875, "learning_rate": 4.375e-07, "loss": 0.0031, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7008.0, "completions/max_terminated_length": 7008.0, "completions/mean_length": 4088.546875, "completions/mean_terminated_length": 4088.546875, "completions/min_length": 1705.0, "completions/min_terminated_length": 1705.0, "epoch": 0.00021802325581395349, "grad_norm": 0.01625597308376849, "kl": 0.0009918212890625, "learning_rate": 5e-07, "loss": 0.0013, "num_tokens": 1783761.0, "reward": 0.3732198178768158, "reward_std": 0.2907864451408386, "rewards/avg_thinking_length_func": 177.95510864257812, "rewards/correct_answer_reward_func": 0.390625, "rewards/efficient_thinking_reward_func": 0.8993925619789238, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.6866124868392944, "rewards/tool_execution_reward_func": 1.950781226158142, "rewards/visit_tool_reward_func": 0.8574961423873901, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00024224806201550387, "grad_norm": 0.016618535814852814, "kl": 0.0009899139404296875, "learning_rate": 5.625e-07, "loss": 0.0013, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00026647286821705426, "grad_norm": 0.016248156263205492, "kl": 0.0009660720825195312, "learning_rate": 6.249999999999999e-07, "loss": 0.0013, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00029069767441860465, "grad_norm": 0.016111032400620007, "kl": 0.0009870529174804688, "learning_rate": 6.875e-07, "loss": 0.0013, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6572.0, "completions/max_terminated_length": 6572.0, "completions/mean_length": 4119.703125, "completions/mean_terminated_length": 4119.703125, "completions/min_length": 1356.0, "completions/min_terminated_length": 1356.0, "epoch": 0.00031492248062015503, "grad_norm": 0.019643777904198217, "kl": 0.0009822845458984375, "learning_rate": 7.5e-07, "loss": -0.0008, "num_tokens": 2367034.0, "reward": 0.6774564981460571, "reward_std": 0.3563808798789978, "rewards/avg_thinking_length_func": 176.69476318359375, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.8704519537344548, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.6578426361083984, "rewards/tool_execution_reward_func": 2.0, "rewards/visit_tool_reward_func": 0.9361900091171265, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003391472868217054, "grad_norm": 0.0194815826710202, "kl": 0.0010242462158203125, "learning_rate": 8.125e-07, "loss": -0.0008, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003633720930232558, "grad_norm": 0.019402854833833996, "kl": 0.0010585784912109375, "learning_rate": 8.75e-07, "loss": -0.0008, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003875968992248062, "grad_norm": 0.019438299719581362, "kl": 0.0011272430419921875, "learning_rate": 9.374999999999999e-07, "loss": -0.0008, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7597.0, "completions/max_terminated_length": 7597.0, "completions/mean_length": 4205.671875, "completions/mean_terminated_length": 4205.671875, "completions/min_length": 1507.0, "completions/min_terminated_length": 1507.0, "epoch": 0.0004118217054263566, "grad_norm": 0.014823687168402296, "kl": 0.0011005401611328125, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 2985545.0, "reward": 0.3260263204574585, "reward_std": 0.2300996333360672, "rewards/avg_thinking_length_func": 177.14329528808594, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 0.8988714947132084, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.8095711469650269, "rewards/tool_execution_reward_func": 1.99609375, "rewards/visit_tool_reward_func": 0.852025032043457, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00043604651162790697, "grad_norm": 0.014727006858324664, "kl": 0.0011577606201171875, "learning_rate": 1.0625e-06, "loss": 0.0009, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00046027131782945736, "grad_norm": 0.014837711956269274, "kl": 0.0012722015380859375, "learning_rate": 1.125e-06, "loss": 0.0009, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00048449612403100775, "grad_norm": 0.014894430575329584, "kl": 0.00146484375, "learning_rate": 1.1874999999999999e-06, "loss": 0.0009, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7476.0, "completions/max_terminated_length": 7476.0, "completions/mean_length": 4097.921875, "completions/mean_terminated_length": 4097.921875, "completions/min_length": 1514.0, "completions/min_terminated_length": 1514.0, "epoch": 0.0005087209302325581, "grad_norm": 0.0189498267274778, "kl": 0.0019931793212890625, "learning_rate": 1.2499999999999999e-06, "loss": 0.0003, "num_tokens": 3561495.0, "reward": 0.5717383623123169, "reward_std": 0.33007949590682983, "rewards/avg_thinking_length_func": 177.5142822265625, "rewards/correct_answer_reward_func": 0.53125, "rewards/efficient_thinking_reward_func": 0.8662384906971484, "rewards/format_reward_func": 0.9937499761581421, "rewards/num_xml_reward_func": 1.779766321182251, "rewards/tool_execution_reward_func": 1.979819416999817, "rewards/visit_tool_reward_func": 0.9004297256469727, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005329457364341085, "grad_norm": 0.019010527717988047, "kl": 0.00229644775390625, "learning_rate": 1.3125e-06, "loss": 0.0003, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005571705426356589, "grad_norm": 0.01910688815244073, "kl": 0.00276947021484375, "learning_rate": 1.375e-06, "loss": 0.0003, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005813953488372093, "grad_norm": 0.019047374161024387, "kl": 0.00327301025390625, "learning_rate": 1.4375e-06, "loss": 0.0003, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7779.0, "completions/max_terminated_length": 7779.0, "completions/mean_length": 4011.9375, "completions/mean_terminated_length": 4011.9375, "completions/min_length": 1884.0, "completions/min_terminated_length": 1884.0, "epoch": 0.0006056201550387597, "grad_norm": 0.01969391991938911, "kl": 0.0029449462890625, "learning_rate": 1.5e-06, "loss": 0.0003, "num_tokens": 4148002.0, "reward": 0.4466557502746582, "reward_std": 0.2478387951850891, "rewards/avg_thinking_length_func": 174.6974639892578, "rewards/correct_answer_reward_func": 0.40625, "rewards/efficient_thinking_reward_func": 0.9054659197504085, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.806973934173584, "rewards/tool_execution_reward_func": 1.9922122955322266, "rewards/visit_tool_reward_func": 0.871803879737854, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006298449612403101, "grad_norm": 0.01979038843755439, "kl": 0.003414154052734375, "learning_rate": 1.5624999999999999e-06, "loss": 0.0003, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006540697674418605, "grad_norm": 0.019676702255338734, "kl": 0.004245758056640625, "learning_rate": 1.625e-06, "loss": 0.0003, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006782945736434108, "grad_norm": 0.0198896583655868, "kl": 0.00508880615234375, "learning_rate": 1.6875e-06, "loss": 0.0003, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7881.0, "completions/max_terminated_length": 7881.0, "completions/mean_length": 4278.0, "completions/mean_terminated_length": 4278.0, "completions/min_length": 1269.0, "completions/min_terminated_length": 1269.0, "epoch": 0.0007025193798449612, "grad_norm": 0.02473412222614823, "kl": 0.00722503662109375, "learning_rate": 1.75e-06, "loss": 0.0005, "num_tokens": 4732732.0, "reward": 0.639769971370697, "reward_std": 0.3489268720149994, "rewards/avg_thinking_length_func": 183.79090881347656, "rewards/correct_answer_reward_func": 0.640625, "rewards/efficient_thinking_reward_func": 0.8433743364598003, "rewards/format_reward_func": 0.9991071224212646, "rewards/num_xml_reward_func": 1.686936616897583, "rewards/tool_execution_reward_func": 1.9818710088729858, "rewards/visit_tool_reward_func": 0.923589289188385, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007267441860465116, "grad_norm": 0.024757116664213524, "kl": 0.0076904296875, "learning_rate": 1.8125e-06, "loss": 0.0005, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000750968992248062, "grad_norm": 0.02444644630643307, "kl": 0.0073394775390625, "learning_rate": 1.8749999999999998e-06, "loss": 0.0005, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007751937984496124, "grad_norm": 0.024210451469423133, "kl": 0.007171630859375, "learning_rate": 1.9375e-06, "loss": 0.0005, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7912.0, "completions/max_terminated_length": 7912.0, "completions/mean_length": 4317.890625, "completions/mean_terminated_length": 4317.890625, "completions/min_length": 1736.0, "completions/min_terminated_length": 1736.0, "epoch": 0.0007994186046511628, "grad_norm": 0.020658762871057952, "kl": 0.007049560546875, "learning_rate": 2e-06, "loss": -0.0, "num_tokens": 5347783.0, "reward": 0.33683592081069946, "reward_std": 0.32624948024749756, "rewards/avg_thinking_length_func": 177.01129150390625, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 0.8817601664392056, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.5408036708831787, "rewards/tool_execution_reward_func": 1.9917367696762085, "rewards/visit_tool_reward_func": 0.9276807308197021, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008236434108527132, "grad_norm": 0.02072632567074888, "kl": 0.0077972412109375, "learning_rate": 2e-06, "loss": -0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008478682170542636, "grad_norm": 0.020770020029080613, "kl": 0.0087432861328125, "learning_rate": 2e-06, "loss": -0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008720930232558139, "grad_norm": 0.020487067102301602, "kl": 0.0097198486328125, "learning_rate": 2e-06, "loss": -0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7378.0, "completions/max_terminated_length": 7378.0, "completions/mean_length": 4152.5, "completions/mean_terminated_length": 4152.5, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "epoch": 0.0008963178294573643, "grad_norm": 0.022364107178309313, "kl": 0.0112152099609375, "learning_rate": 2e-06, "loss": -0.0001, "num_tokens": 5921090.0, "reward": 0.6556656360626221, "reward_std": 0.5008378028869629, "rewards/avg_thinking_length_func": 170.4791259765625, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.8892575272805912, "rewards/format_reward_func": 0.987500011920929, "rewards/num_xml_reward_func": 1.5408031940460205, "rewards/tool_execution_reward_func": 1.96875, "rewards/visit_tool_reward_func": 0.9249746799468994, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009205426356589147, "grad_norm": 0.022597206540891295, "kl": 0.0123443603515625, "learning_rate": 2e-06, "loss": -0.0001, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009447674418604651, "grad_norm": 0.02246679376217943, "kl": 0.013580322265625, "learning_rate": 2e-06, "loss": -0.0001, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009689922480620155, "grad_norm": 0.022296105800735398, "kl": 0.015106201171875, "learning_rate": 2e-06, "loss": -0.0001, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7494.0, "completions/max_terminated_length": 7494.0, "completions/mean_length": 4562.296875, "completions/mean_terminated_length": 4562.296875, "completions/min_length": 2143.0, "completions/min_terminated_length": 2143.0, "epoch": 0.0009932170542635659, "grad_norm": 0.021503135345542313, "kl": 0.015594482421875, "learning_rate": 2e-06, "loss": 0.0007, "num_tokens": 6556719.0, "reward": 0.47225743532180786, "reward_std": 0.3904932141304016, "rewards/avg_thinking_length_func": 169.57839965820312, "rewards/correct_answer_reward_func": 0.4375, "rewards/efficient_thinking_reward_func": 0.917264621947748, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.817958116531372, "rewards/tool_execution_reward_func": 1.9884111881256104, "rewards/visit_tool_reward_func": 0.9651369452476501, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010174418604651163, "grad_norm": 0.02149252867250571, "kl": 0.01715087890625, "learning_rate": 2e-06, "loss": 0.0007, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010416666666666667, "grad_norm": 0.02173596902997293, "kl": 0.018798828125, "learning_rate": 2e-06, "loss": 0.0007, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001065891472868217, "grad_norm": 0.02188237517399594, "kl": 0.020751953125, "learning_rate": 2e-06, "loss": 0.0007, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9017.0, "completions/max_terminated_length": 9017.0, "completions/mean_length": 4664.796875, "completions/mean_terminated_length": 4664.796875, "completions/min_length": 1910.0, "completions/min_terminated_length": 1910.0, "epoch": 0.0010901162790697674, "grad_norm": 0.02354857583102173, "kl": 0.020477294921875, "learning_rate": 2e-06, "loss": -0.0014, "num_tokens": 7181732.0, "reward": 0.7991669178009033, "reward_std": 0.36247026920318604, "rewards/avg_thinking_length_func": 171.8461151123047, "rewards/correct_answer_reward_func": 0.703125, "rewards/efficient_thinking_reward_func": 0.8915984372821139, "rewards/format_reward_func": 0.9998437166213989, "rewards/num_xml_reward_func": 1.8501074314117432, "rewards/tool_execution_reward_func": 1.9971354007720947, "rewards/visit_tool_reward_func": 1.071668028831482, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011143410852713178, "grad_norm": 0.023994471938115103, "kl": 0.0224609375, "learning_rate": 2e-06, "loss": -0.0014, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011385658914728682, "grad_norm": 0.026516939220345738, "kl": 0.02508544921875, "learning_rate": 2e-06, "loss": -0.0014, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011627906976744186, "grad_norm": 0.024485287814160223, "kl": 0.0262451171875, "learning_rate": 2e-06, "loss": -0.0014, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8522.0, "completions/max_terminated_length": 8522.0, "completions/mean_length": 4866.125, "completions/mean_terminated_length": 4866.125, "completions/min_length": 1959.0, "completions/min_terminated_length": 1959.0, "epoch": 0.001187015503875969, "grad_norm": 0.02407332594201, "kl": 0.032012939453125, "learning_rate": 2e-06, "loss": 0.0014, "num_tokens": 7868034.0, "reward": 0.39128515124320984, "reward_std": 0.3533371090888977, "rewards/avg_thinking_length_func": 164.74734497070312, "rewards/correct_answer_reward_func": 0.359375, "rewards/efficient_thinking_reward_func": 0.9209367558816545, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.6406757831573486, "rewards/tool_execution_reward_func": 1.98927903175354, "rewards/visit_tool_reward_func": 1.0120830535888672, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012112403100775194, "grad_norm": 0.02479690454991753, "kl": 0.035888671875, "learning_rate": 2e-06, "loss": 0.0014, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012354651162790698, "grad_norm": 0.027012142633289393, "kl": 0.04046630859375, "learning_rate": 2e-06, "loss": 0.0014, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012596899224806201, "grad_norm": 0.026499465739179152, "kl": 0.04803466796875, "learning_rate": 2e-06, "loss": 0.0014, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7622.0, "completions/max_terminated_length": 7622.0, "completions/mean_length": 4509.75, "completions/mean_terminated_length": 4509.75, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 0.0012839147286821705, "grad_norm": 0.019741394516818018, "kl": 0.04510498046875, "learning_rate": 2e-06, "loss": 0.0, "num_tokens": 8481102.0, "reward": 0.7655854225158691, "reward_std": 0.27847254276275635, "rewards/avg_thinking_length_func": 158.9434051513672, "rewards/correct_answer_reward_func": 0.671875, "rewards/efficient_thinking_reward_func": 0.884494477975468, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.8834664821624756, "rewards/tool_execution_reward_func": 2.0, "rewards/visit_tool_reward_func": 1.1049017906188965, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001308139534883721, "grad_norm": 0.028517188784132036, "kl": 0.06060791015625, "learning_rate": 2e-06, "loss": 0.0001, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0013323643410852713, "grad_norm": 0.02643367822401968, "kl": 0.06280517578125, "learning_rate": 2e-06, "loss": 0.0001, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0013565891472868217, "grad_norm": 0.020594752118506976, "kl": 0.056884765625, "learning_rate": 2e-06, "loss": 0.0001, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7375.0, "completions/max_terminated_length": 7375.0, "completions/mean_length": 4285.046875, "completions/mean_terminated_length": 4285.046875, "completions/min_length": 2418.0, "completions/min_terminated_length": 2418.0, "epoch": 0.001380813953488372, "grad_norm": 0.019100627823517295, "kl": 0.06195068359375, "learning_rate": 2e-06, "loss": 0.0005, "num_tokens": 9112297.0, "reward": 0.5274717807769775, "reward_std": 0.2380232810974121, "rewards/avg_thinking_length_func": 145.75924682617188, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.9274070198828231, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.7929463386535645, "rewards/tool_execution_reward_func": 1.9959805011749268, "rewards/visit_tool_reward_func": 1.0335674285888672, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014050387596899225, "grad_norm": 0.019834849658967178, "kl": 0.06695556640625, "learning_rate": 2e-06, "loss": 0.0005, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014292635658914728, "grad_norm": 0.020359737753586633, "kl": 0.0740966796875, "learning_rate": 2e-06, "loss": 0.0005, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014534883720930232, "grad_norm": 0.020904893352951728, "kl": 0.085693359375, "learning_rate": 2e-06, "loss": 0.0005, "step": 60 } ], "logging_steps": 1, "max_steps": 640, "num_input_tokens_seen": 9112297, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }