初始化项目，由ModelHub XC社区提供模型

Model: fpadovani/tur_indomain_prepretraining_seed577 Source: Original Platform
2026-05-30 04:36:21 +08:00
commit 7dbe71f216
97 changed files with 2033223 additions and 0 deletions
--- a/checkpoint-500/config.json
+++ b/checkpoint-500/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-500/generation_config.json
+++ b/checkpoint-500/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-500/model.safetensors
+++ b/checkpoint-500/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c9e376ce556a5336310da4308b7abbf5a49d021629da77f8eebad222b6761b1
+size 79752272
--- a/checkpoint-500/optimizer.pt
+++ b/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c67791844b3ab1e1c118a1440d1be978538e6063429cf1062c9850223bf80bb3
+size 159538443
--- a/checkpoint-500/rng_state.pth
+++ b/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:678c42a1f81ca322dd86dd40209eda932f87e51895fd1824b1d43ced2a361ea4
+size 14645
--- a/checkpoint-500/scheduler.pt
+++ b/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f243d020d2631ea115c3f5d3e6c1e5c9a7f8334b45c5f10056444d8c3da615c
+size 1465
--- a/checkpoint-500/special_tokens_map.json
+++ b/checkpoint-500/special_tokens_map.json
--- a/checkpoint-500/tokenizer.json
+++ b/checkpoint-500/tokenizer.json
--- a/checkpoint-500/tokenizer_config.json
+++ b/checkpoint-500/tokenizer_config.json
--- a/checkpoint-500/trainer_state.json
+++ b/checkpoint-500/trainer_state.json
@@ -0,0 +1,733 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.0653893938403191,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000653893938403191,
+      "grad_norm": 7.78125,
+      "learning_rate": 1e-05,
+      "loss": 10.9314,
+      "step": 5
+    },
+    {
+      "epoch": 0.001307787876806382,
+      "grad_norm": 7.15625,
+      "learning_rate": 2e-05,
+      "loss": 10.8618,
+      "step": 10
+    },
+    {
+      "epoch": 0.001961681815209573,
+      "grad_norm": 8.625,
+      "learning_rate": 3e-05,
+      "loss": 10.6602,
+      "step": 15
+    },
+    {
+      "epoch": 0.002615575753612764,
+      "grad_norm": 3.78125,
+      "learning_rate": 4e-05,
+      "loss": 10.4672,
+      "step": 20
+    },
+    {
+      "epoch": 0.003269469692015955,
+      "grad_norm": 3.21875,
+      "learning_rate": 5e-05,
+      "loss": 10.3043,
+      "step": 25
+    },
+    {
+      "epoch": 0.003923363630419146,
+      "grad_norm": 2.921875,
+      "learning_rate": 6e-05,
+      "loss": 10.2382,
+      "step": 30
+    },
+    {
+      "epoch": 0.004577257568822337,
+      "grad_norm": 2.84375,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 10.072,
+      "step": 35
+    },
+    {
+      "epoch": 0.005231151507225528,
+      "grad_norm": 2.84375,
+      "learning_rate": 8e-05,
+      "loss": 9.9208,
+      "step": 40
+    },
+    {
+      "epoch": 0.005885045445628719,
+      "grad_norm": 2.5,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 9.7828,
+      "step": 45
+    },
+    {
+      "epoch": 0.00653893938403191,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0001,
+      "loss": 9.5983,
+      "step": 50
+    },
+    {
+      "epoch": 0.007192833322435101,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.00011,
+      "loss": 9.3536,
+      "step": 55
+    },
+    {
+      "epoch": 0.007846727260838291,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.00012,
+      "loss": 9.1994,
+      "step": 60
+    },
+    {
+      "epoch": 0.008500621199241483,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 9.0285,
+      "step": 65
+    },
+    {
+      "epoch": 0.009154515137644674,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 8.8309,
+      "step": 70
+    },
+    {
+      "epoch": 0.009808409076047865,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00015,
+      "loss": 8.7217,
+      "step": 75
+    },
+    {
+      "epoch": 0.010462303014451056,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00016,
+      "loss": 8.5904,
+      "step": 80
+    },
+    {
+      "epoch": 0.011116196952854247,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00017,
+      "loss": 8.5347,
+      "step": 85
+    },
+    {
+      "epoch": 0.011770090891257438,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 8.5,
+      "step": 90
+    },
+    {
+      "epoch": 0.01242398482966063,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00019,
+      "loss": 8.4191,
+      "step": 95
+    },
+    {
+      "epoch": 0.01307787876806382,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0002,
+      "loss": 8.4152,
+      "step": 100
+    },
+    {
+      "epoch": 0.013731772706467011,
+      "grad_norm": 1.25,
+      "learning_rate": 0.00021,
+      "loss": 8.4117,
+      "step": 105
+    },
+    {
+      "epoch": 0.014385666644870202,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00022,
+      "loss": 8.3862,
+      "step": 110
+    },
+    {
+      "epoch": 0.015039560583273394,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00023,
+      "loss": 8.3845,
+      "step": 115
+    },
+    {
+      "epoch": 0.015693454521676583,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00024,
+      "loss": 8.3856,
+      "step": 120
+    },
+    {
+      "epoch": 0.016347348460079774,
+      "grad_norm": 2.140625,
+      "learning_rate": 0.00025,
+      "loss": 8.3267,
+      "step": 125
+    },
+    {
+      "epoch": 0.017001242398482965,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 8.3117,
+      "step": 130
+    },
+    {
+      "epoch": 0.017655136336886156,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00027,
+      "loss": 8.2819,
+      "step": 135
+    },
+    {
+      "epoch": 0.018309030275289347,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 8.3206,
+      "step": 140
+    },
+    {
+      "epoch": 0.01896292421369254,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00029,
+      "loss": 8.2799,
+      "step": 145
+    },
+    {
+      "epoch": 0.01961681815209573,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0003,
+      "loss": 8.2797,
+      "step": 150
+    },
+    {
+      "epoch": 0.02027071209049892,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00031,
+      "loss": 8.2185,
+      "step": 155
+    },
+    {
+      "epoch": 0.02092460602890211,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00032,
+      "loss": 8.1803,
+      "step": 160
+    },
+    {
+      "epoch": 0.021578499967305303,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00033,
+      "loss": 8.1965,
+      "step": 165
+    },
+    {
+      "epoch": 0.022232393905708494,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00034,
+      "loss": 8.1966,
+      "step": 170
+    },
+    {
+      "epoch": 0.022886287844111685,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.00035,
+      "loss": 8.1858,
+      "step": 175
+    },
+    {
+      "epoch": 0.023540181782514876,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 8.1155,
+      "step": 180
+    },
+    {
+      "epoch": 0.024194075720918067,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00037,
+      "loss": 8.0795,
+      "step": 185
+    },
+    {
+      "epoch": 0.02484796965932126,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.00038,
+      "loss": 8.0587,
+      "step": 190
+    },
+    {
+      "epoch": 0.02550186359772445,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 8.0238,
+      "step": 195
+    },
+    {
+      "epoch": 0.02615575753612764,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.0004,
+      "loss": 8.0203,
+      "step": 200
+    },
+    {
+      "epoch": 0.02680965147453083,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00041,
+      "loss": 8.0557,
+      "step": 205
+    },
+    {
+      "epoch": 0.027463545412934023,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00042,
+      "loss": 8.0153,
+      "step": 210
+    },
+    {
+      "epoch": 0.028117439351337214,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00043,
+      "loss": 8.0324,
+      "step": 215
+    },
+    {
+      "epoch": 0.028771333289740405,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.00044,
+      "loss": 7.943,
+      "step": 220
+    },
+    {
+      "epoch": 0.029425227228143596,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 7.9174,
+      "step": 225
+    },
+    {
+      "epoch": 0.030079121166546787,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00046,
+      "loss": 7.9345,
+      "step": 230
+    },
+    {
+      "epoch": 0.030733015104949978,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00047,
+      "loss": 7.8904,
+      "step": 235
+    },
+    {
+      "epoch": 0.031386909043353166,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00048,
+      "loss": 7.9578,
+      "step": 240
+    },
+    {
+      "epoch": 0.03204080298175636,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00049,
+      "loss": 7.8584,
+      "step": 245
+    },
+    {
+      "epoch": 0.03269469692015955,
+      "grad_norm": 1.953125,
+      "learning_rate": 0.0005,
+      "loss": 7.8777,
+      "step": 250
+    },
+    {
+      "epoch": 0.03334859085856274,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00051,
+      "loss": 7.8879,
+      "step": 255
+    },
+    {
+      "epoch": 0.03400248479696593,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 7.8749,
+      "step": 260
+    },
+    {
+      "epoch": 0.03465637873536912,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0005300000000000001,
+      "loss": 7.8588,
+      "step": 265
+    },
+    {
+      "epoch": 0.03531027267377231,
+      "grad_norm": 1.8125,
+      "learning_rate": 0.00054,
+      "loss": 7.8164,
+      "step": 270
+    },
+    {
+      "epoch": 0.035964166612175504,
+      "grad_norm": 1.890625,
+      "learning_rate": 0.00055,
+      "loss": 7.8153,
+      "step": 275
+    },
+    {
+      "epoch": 0.036618060550578695,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 7.8396,
+      "step": 280
+    },
+    {
+      "epoch": 0.037271954488981886,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00057,
+      "loss": 7.8175,
+      "step": 285
+    },
+    {
+      "epoch": 0.03792584842738508,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00058,
+      "loss": 7.8173,
+      "step": 290
+    },
+    {
+      "epoch": 0.03857974236578827,
+      "grad_norm": 1.765625,
+      "learning_rate": 0.00059,
+      "loss": 7.7171,
+      "step": 295
+    },
+    {
+      "epoch": 0.03923363630419146,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0006,
+      "loss": 7.7434,
+      "step": 300
+    },
+    {
+      "epoch": 0.03988753024259465,
+      "grad_norm": 1.75,
+      "learning_rate": 0.00061,
+      "loss": 7.7722,
+      "step": 305
+    },
+    {
+      "epoch": 0.04054142418099784,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00062,
+      "loss": 7.7211,
+      "step": 310
+    },
+    {
+      "epoch": 0.04119531811940103,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00063,
+      "loss": 7.7748,
+      "step": 315
+    },
+    {
+      "epoch": 0.04184921205780422,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00064,
+      "loss": 7.6554,
+      "step": 320
+    },
+    {
+      "epoch": 0.042503105996207415,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 7.6331,
+      "step": 325
+    },
+    {
+      "epoch": 0.043156999934610606,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00066,
+      "loss": 7.7316,
+      "step": 330
+    },
+    {
+      "epoch": 0.0438108938730138,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00067,
+      "loss": 7.6819,
+      "step": 335
+    },
+    {
+      "epoch": 0.04446478781141699,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00068,
+      "loss": 7.6488,
+      "step": 340
+    },
+    {
+      "epoch": 0.04511868174982018,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00069,
+      "loss": 7.5754,
+      "step": 345
+    },
+    {
+      "epoch": 0.04577257568822337,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0007,
+      "loss": 7.6011,
+      "step": 350
+    },
+    {
+      "epoch": 0.04642646962662656,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00071,
+      "loss": 7.5742,
+      "step": 355
+    },
+    {
+      "epoch": 0.04708036356502975,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 7.5733,
+      "step": 360
+    },
+    {
+      "epoch": 0.04773425750343294,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00073,
+      "loss": 7.5406,
+      "step": 365
+    },
+    {
+      "epoch": 0.048388151441836134,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00074,
+      "loss": 7.5902,
+      "step": 370
+    },
+    {
+      "epoch": 0.049042045380239326,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00075,
+      "loss": 7.5615,
+      "step": 375
+    },
+    {
+      "epoch": 0.04969593931864252,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00076,
+      "loss": 7.5224,
+      "step": 380
+    },
+    {
+      "epoch": 0.05034983325704571,
+      "grad_norm": 1.546875,
+      "learning_rate": 0.0007700000000000001,
+      "loss": 7.5185,
+      "step": 385
+    },
+    {
+      "epoch": 0.0510037271954489,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 7.4919,
+      "step": 390
+    },
+    {
+      "epoch": 0.05165762113385209,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00079,
+      "loss": 7.4876,
+      "step": 395
+    },
+    {
+      "epoch": 0.05231151507225528,
+      "grad_norm": 1.578125,
+      "learning_rate": 0.0008,
+      "loss": 7.4722,
+      "step": 400
+    },
+    {
+      "epoch": 0.05296540901065847,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.0008100000000000001,
+      "loss": 7.498,
+      "step": 405
+    },
+    {
+      "epoch": 0.05361930294906166,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00082,
+      "loss": 7.4047,
+      "step": 410
+    },
+    {
+      "epoch": 0.054273196887464854,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00083,
+      "loss": 7.4376,
+      "step": 415
+    },
+    {
+      "epoch": 0.054927090825868045,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00084,
+      "loss": 7.4518,
+      "step": 420
+    },
+    {
+      "epoch": 0.055580984764271237,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00085,
+      "loss": 7.4957,
+      "step": 425
+    },
+    {
+      "epoch": 0.05623487870267443,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00086,
+      "loss": 7.4782,
+      "step": 430
+    },
+    {
+      "epoch": 0.05688877264107762,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00087,
+      "loss": 7.4289,
+      "step": 435
+    },
+    {
+      "epoch": 0.05754266657948081,
+      "grad_norm": 3.09375,
+      "learning_rate": 0.00088,
+      "loss": 7.4412,
+      "step": 440
+    },
+    {
+      "epoch": 0.058196560517884,
+      "grad_norm": 1.625,
+      "learning_rate": 0.0008900000000000001,
+      "loss": 7.413,
+      "step": 445
+    },
+    {
+      "epoch": 0.05885045445628719,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 7.4372,
+      "step": 450
+    },
+    {
+      "epoch": 0.05950434839469038,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00091,
+      "loss": 7.3155,
+      "step": 455
+    },
+    {
+      "epoch": 0.060158242333093574,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00092,
+      "loss": 7.4253,
+      "step": 460
+    },
+    {
+      "epoch": 0.060812136271496765,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00093,
+      "loss": 7.3755,
+      "step": 465
+    },
+    {
+      "epoch": 0.061466030209899956,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00094,
+      "loss": 7.391,
+      "step": 470
+    },
+    {
+      "epoch": 0.06211992414830315,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00095,
+      "loss": 7.3909,
+      "step": 475
+    },
+    {
+      "epoch": 0.06277381808670633,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00096,
+      "loss": 7.3484,
+      "step": 480
+    },
+    {
+      "epoch": 0.06342771202510952,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0009699999999999999,
+      "loss": 7.3424,
+      "step": 485
+    },
+    {
+      "epoch": 0.06408160596351271,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.00098,
+      "loss": 7.3676,
+      "step": 490
+    },
+    {
+      "epoch": 0.0647354999019159,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00099,
+      "loss": 7.2811,
+      "step": 495
+    },
+    {
+      "epoch": 0.0653893938403191,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.001,
+      "loss": 7.2938,
+      "step": 500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 683343042969600.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/checkpoint-500/training_args.bin
+++ b/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d844e9bd7269e969751057307f9b24ce1dc61943d4ef008f2ddc415e2d7a0e8
+size 6161