初始化项目，由ModelHub XC社区提供模型

Model: wmln/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-strong_wise_gecko Source: Original Platform
2026-05-26 15:10:22 +08:00
commit 2b80d59206
15 changed files with 152059 additions and 0 deletions
--- a/trainer_state.json
+++ b/trainer_state.json
@@ -0,0 +1,233 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 20,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "completion_length": 375.875,
+      "epoch": 0.1,
+      "grad_norm": 34.61153793334961,
+      "kl": 0.0,
+      "learning_rate": 4.965903258506806e-07,
+      "loss": 0.0,
+      "reward": 0.3664499084734416,
+      "reward_std": 0.37731685693142936,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0,
+      "rewards/question_recreation_reward_func": 0.24698116456056596,
+      "rewards/soft_format_reward_func": 0.015625,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.10384375043213367,
+      "step": 2
+    },
+    {
+      "completion_length": 356.71875,
+      "epoch": 0.2,
+      "grad_norm": 27.17987060546875,
+      "kl": 0.0014200315781636164,
+      "learning_rate": 4.698684378016222e-07,
+      "loss": 0.0,
+      "reward": 0.44327147863805294,
+      "reward_std": 0.5475165799725801,
+      "rewards/concensus_correctness_reward_func": 0.015687499195337296,
+      "rewards/consensus_reward_func": 0.0625,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0625,
+      "rewards/question_recreation_reward_func": 0.27499022823758423,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.02759375609457493,
+      "step": 4
+    },
+    {
+      "completion_length": 364.78125,
+      "epoch": 0.3,
+      "grad_norm": 8.681589126586914,
+      "kl": 0.01875807526448625,
+      "learning_rate": 4.193203929064353e-07,
+      "loss": 0.0,
+      "reward": 0.3844468754250556,
+      "reward_std": 0.26774020673474297,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0,
+      "rewards/question_recreation_reward_func": 0.31691562850028276,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.06753125367686152,
+      "step": 6
+    },
+    {
+      "completion_length": 363.6875,
+      "epoch": 0.4,
+      "grad_norm": 76.04808807373047,
+      "kl": 0.008640145704703173,
+      "learning_rate": 3.5042385616324236e-07,
+      "loss": 0.0,
+      "reward": 0.1568439636612311,
+      "reward_std": 0.5598289684858173,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0625,
+      "rewards/question_recreation_reward_func": 0.18931270475150086,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": -0.09496874565957114,
+      "step": 8
+    },
+    {
+      "completion_length": 395.5625,
+      "epoch": 0.5,
+      "grad_norm": 8.20077133178711,
+      "kl": 0.00585838263577898,
+      "learning_rate": 2.706448363680831e-07,
+      "loss": 0.0,
+      "reward": 0.44814145751297474,
+      "reward_std": 0.4613042630953714,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0625,
+      "rewards/question_recreation_reward_func": 0.2386414643842727,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.147000000346452,
+      "step": 10
+    },
+    {
+      "completion_length": 333.625,
+      "epoch": 0.6,
+      "grad_norm": 9.751823425292969,
+      "kl": 0.9474406025801727,
+      "learning_rate": 1.886286282148002e-07,
+      "loss": 0.0009,
+      "reward": 0.49279772784211673,
+      "reward_std": 0.411659850156866,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0625,
+      "rewards/question_recreation_reward_func": 0.3130477310915012,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.11725000198930502,
+      "step": 12
+    },
+    {
+      "completion_length": 257.25,
+      "epoch": 0.7,
+      "grad_norm": 7.342770099639893,
+      "kl": 0.00215080863199546,
+      "learning_rate": 1.1326296046939333e-07,
+      "loss": 0.0,
+      "reward": 0.44488345994614065,
+      "reward_std": 0.318172043771483,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0625,
+      "rewards/question_recreation_reward_func": 0.19450845930259675,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": 0.18787499610334635,
+      "step": 14
+    },
+    {
+      "completion_length": 390.46875,
+      "epoch": 0.8,
+      "grad_norm": 4.781615257263184,
+      "kl": 0.0012646604518522508,
+      "learning_rate": 5.271487265090163e-08,
+      "loss": 0.0,
+      "reward": 0.4826530911959708,
+      "reward_std": 0.7335095015587285,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.25,
+      "rewards/question_recreation_reward_func": 0.42434057663194835,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": -0.1916874992166413,
+      "step": 16
+    },
+    {
+      "completion_length": 397.03125,
+      "epoch": 0.9,
+      "grad_norm": 8.631686210632324,
+      "kl": 0.23009972504951293,
+      "learning_rate": 1.3545689574841341e-08,
+      "loss": 0.0002,
+      "reward": 1.1463897689245641,
+      "reward_std": 1.4765515620936185,
+      "rewards/concensus_correctness_reward_func": 0.7297499999403954,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.1875,
+      "rewards/question_recreation_reward_func": 0.2630148070747964,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": -0.03387500322423875,
+      "step": 18
+    },
+    {
+      "completion_length": 307.84375,
+      "epoch": 1.0,
+      "grad_norm": 68.26083374023438,
+      "kl": 3.183381193990499,
+      "learning_rate": 0.0,
+      "loss": 0.0032,
+      "reward": -0.019438669085502625,
+      "reward_std": 0.5264092059223913,
+      "rewards/concensus_correctness_reward_func": 0.0,
+      "rewards/consensus_reward_func": 0.0,
+      "rewards/cumulative_reward_2": 0.0,
+      "rewards/final_correctness_reward_func": 0.0,
+      "rewards/question_recreation_reward_func": 0.16081133193802088,
+      "rewards/soft_format_reward_func": 0.0,
+      "rewards/strict_format_reward_func": 0.0,
+      "rewards/xmlcount_reward_func": -0.18025000300258398,
+      "step": 20
+    },
+    {
+      "epoch": 1.0,
+      "step": 20,
+      "total_flos": 0.0,
+      "train_loss": 0.0004398912084980111,
+      "train_runtime": 308.8876,
+      "train_samples_per_second": 1.036,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 20,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}