commit 184289a7e17492603d6636a2d2f50c072705de5f Author: ModelHub XC Date: Sat Jun 6 08:54:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: modrill/kodcode_3_qwen3_4b_sft Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..85e2bfb --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-659/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c74695 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +--- +license: cc-by-nc-4.0 +tags: +- mhm +- text-generation +library_name: transformers +--- + +# kodcode_3_qwen3_4b_sft + +Auto-uploaded from local output (MergeBench and LlamaFactory excluded). + +- Source path: `trl/qwen3-4b-sft-kodcode-3` +- Type: `full` +- Uploaded at: `2026-05-20T06:49:11.293145` +- Visibility: `public` diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..89324cc --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,12 @@ +{%- for message in messages %} +{%- if message.role == "user" %} +{{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }} +{%- elif message.role == "system" %} +{{- '<|im_start|>system\n' + message.content + '<|im_end|>\n' }} +{%- elif message.role == "assistant" %} +{{- '<|im_start|>assistant\n' }}{% generation %}{{ message.content }}{% endgeneration %}{{ '<|im_end|>\n' }} +{%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} +{{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-659/chat_template.jinja b/checkpoint-659/chat_template.jinja new file mode 100644 index 0000000..89324cc --- /dev/null +++ b/checkpoint-659/chat_template.jinja @@ -0,0 +1,12 @@ +{%- for message in messages %} +{%- if message.role == "user" %} +{{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }} +{%- elif message.role == "system" %} +{{- '<|im_start|>system\n' + message.content + '<|im_end|>\n' }} +{%- elif message.role == "assistant" %} +{{- '<|im_start|>assistant\n' }}{% generation %}{{ message.content }}{% endgeneration %}{{ '<|im_end|>\n' }} +{%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} +{{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-659/config.json b/checkpoint-659/config.json new file mode 100644 index 0000000..18268ea --- /dev/null +++ b/checkpoint-659/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.8.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-659/generation_config.json b/checkpoint-659/generation_config.json new file mode 100644 index 0000000..dcd33a6 --- /dev/null +++ b/checkpoint-659/generation_config.json @@ -0,0 +1,10 @@ +{ + "do_sample": false, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 2048, + "pad_token_id": 151643, + "transformers_version": "5.8.0" +} diff --git a/checkpoint-659/model.safetensors b/checkpoint-659/model.safetensors new file mode 100644 index 0000000..e654f28 --- /dev/null +++ b/checkpoint-659/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e18e9f555d63e04b2a5d67a10939ba633cf584772c20342c840bc3158f275c +size 8044982080 diff --git a/checkpoint-659/optimizer.pt b/checkpoint-659/optimizer.pt new file mode 100644 index 0000000..706deb8 --- /dev/null +++ b/checkpoint-659/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c2839680bde13ef34d634ec2c6bbbe6f84f6c1edb7b687385a7647accfd8ce +size 16090225449 diff --git a/checkpoint-659/rng_state_0.pth b/checkpoint-659/rng_state_0.pth new file mode 100644 index 0000000..2608234 --- /dev/null +++ b/checkpoint-659/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:659b1cdee2219458dd84ce6a632a595465680b8080e5c44bd600ff97eca8d752 +size 15429 diff --git a/checkpoint-659/rng_state_1.pth b/checkpoint-659/rng_state_1.pth new file mode 100644 index 0000000..d46ce04 --- /dev/null +++ b/checkpoint-659/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86accf27064cdd503053e90476a6bd10de333d4ff0594535ad55ea13a473c91d +size 15429 diff --git a/checkpoint-659/rng_state_2.pth b/checkpoint-659/rng_state_2.pth new file mode 100644 index 0000000..558429d --- /dev/null +++ b/checkpoint-659/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ca8d714ef40be035404c1957b5a4dee84e1f43980408393f8aa710552ee6f6 +size 15429 diff --git a/checkpoint-659/rng_state_3.pth b/checkpoint-659/rng_state_3.pth new file mode 100644 index 0000000..4e54cbe --- /dev/null +++ b/checkpoint-659/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cfdebe99e40accc9c9d8f09c63136a14abda997d9b501969ec8e16e9d183179 +size 15429 diff --git a/checkpoint-659/scheduler.pt b/checkpoint-659/scheduler.pt new file mode 100644 index 0000000..31449d3 --- /dev/null +++ b/checkpoint-659/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638f76f10b8122f6b6d00ef579bef156aea843a74d8ad66f5d19ea5b06be426f +size 1465 diff --git a/checkpoint-659/tokenizer.json b/checkpoint-659/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/checkpoint-659/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-659/tokenizer_config.json b/checkpoint-659/tokenizer_config.json new file mode 100644 index 0000000..770e41d --- /dev/null +++ b/checkpoint-659/tokenizer_config.json @@ -0,0 +1,30 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-659/trainer_state.json b/checkpoint-659/trainer_state.json new file mode 100644 index 0000000..7dd2092 --- /dev/null +++ b/checkpoint-659/trainer_state.json @@ -0,0 +1,684 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 659, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.3487173642963171, + "epoch": 0.015186028853454821, + "grad_norm": 1.609375, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.4099268913269043, + "mean_token_accuracy": 0.8717762351036071, + "num_tokens": 568708.0, + "step": 10 + }, + { + "entropy": 0.37818768359720706, + "epoch": 0.030372057706909643, + "grad_norm": 0.87890625, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.39949469566345214, + "mean_token_accuracy": 0.8737476468086243, + "num_tokens": 1125639.0, + "step": 20 + }, + { + "entropy": 0.40618006475269797, + "epoch": 0.04555808656036447, + "grad_norm": 0.70703125, + "learning_rate": 8.787878787878788e-06, + "loss": 0.3975033760070801, + "mean_token_accuracy": 0.8727035835385323, + "num_tokens": 1683225.0, + "step": 30 + }, + { + "entropy": 0.3896496780216694, + "epoch": 0.060744115413819286, + "grad_norm": 0.6953125, + "learning_rate": 9.997733473639876e-06, + "loss": 0.3925030708312988, + "mean_token_accuracy": 0.8742863699793816, + "num_tokens": 2236895.0, + "step": 40 + }, + { + "entropy": 0.37322904225438835, + "epoch": 0.07593014426727411, + "grad_norm": 0.68359375, + "learning_rate": 9.983889919973586e-06, + "loss": 0.3752753257751465, + "mean_token_accuracy": 0.8792506881058216, + "num_tokens": 2818707.0, + "step": 50 + }, + { + "entropy": 0.3811411205679178, + "epoch": 0.09111617312072894, + "grad_norm": 0.66796875, + "learning_rate": 9.957496810072027e-06, + "loss": 0.38604438304901123, + "mean_token_accuracy": 0.8750339619815349, + "num_tokens": 3351348.0, + "step": 60 + }, + { + "entropy": 0.3796327030286193, + "epoch": 0.10630220197418375, + "grad_norm": 0.66015625, + "learning_rate": 9.918620602428916e-06, + "loss": 0.37710745334625245, + "mean_token_accuracy": 0.8776259452104569, + "num_tokens": 3915545.0, + "step": 70 + }, + { + "entropy": 0.37812459245324137, + "epoch": 0.12148823082763857, + "grad_norm": 0.64453125, + "learning_rate": 9.867359188282193e-06, + "loss": 0.38009963035583494, + "mean_token_accuracy": 0.8783061921596527, + "num_tokens": 4462906.0, + "step": 80 + }, + { + "entropy": 0.3751340739428997, + "epoch": 0.1366742596810934, + "grad_norm": 0.6640625, + "learning_rate": 9.803841645121505e-06, + "loss": 0.37636594772338866, + "mean_token_accuracy": 0.8778362341225148, + "num_tokens": 5029003.0, + "step": 90 + }, + { + "entropy": 0.37551863975822924, + "epoch": 0.15186028853454822, + "grad_norm": 0.6640625, + "learning_rate": 9.728227911667934e-06, + "loss": 0.3773549795150757, + "mean_token_accuracy": 0.8772883579134941, + "num_tokens": 5596042.0, + "step": 100 + }, + { + "entropy": 0.3809764288365841, + "epoch": 0.16704631738800305, + "grad_norm": 0.71484375, + "learning_rate": 9.640708385144403e-06, + "loss": 0.3807323932647705, + "mean_token_accuracy": 0.8774459846317768, + "num_tokens": 6144821.0, + "step": 110 + }, + { + "entropy": 0.37467240951955316, + "epoch": 0.18223234624145787, + "grad_norm": 0.62890625, + "learning_rate": 9.541503441850844e-06, + "loss": 0.37542564868927003, + "mean_token_accuracy": 0.8782215595245362, + "num_tokens": 6691491.0, + "step": 120 + }, + { + "entropy": 0.3787713166326284, + "epoch": 0.19741837509491267, + "grad_norm": 0.7109375, + "learning_rate": 9.430862882251279e-06, + "loss": 0.37993783950805665, + "mean_token_accuracy": 0.8774278596043587, + "num_tokens": 7247335.0, + "step": 130 + }, + { + "entropy": 0.3862619888037443, + "epoch": 0.2126044039483675, + "grad_norm": 0.71484375, + "learning_rate": 9.309065301970193e-06, + "loss": 0.38727219104766847, + "mean_token_accuracy": 0.8749251998960972, + "num_tokens": 7808664.0, + "step": 140 + }, + { + "entropy": 0.3778634283691645, + "epoch": 0.22779043280182232, + "grad_norm": 0.71484375, + "learning_rate": 9.176417390281944e-06, + "loss": 0.38028583526611326, + "mean_token_accuracy": 0.8772468723356723, + "num_tokens": 8360893.0, + "step": 150 + }, + { + "entropy": 0.3749677825719118, + "epoch": 0.24297646165527714, + "grad_norm": 0.69921875, + "learning_rate": 9.033253157859715e-06, + "loss": 0.37344467639923096, + "mean_token_accuracy": 0.8786589197814465, + "num_tokens": 8905139.0, + "step": 160 + }, + { + "entropy": 0.37994367331266404, + "epoch": 0.25816249050873197, + "grad_norm": 0.69140625, + "learning_rate": 8.879933095728485e-06, + "loss": 0.38379650115966796, + "mean_token_accuracy": 0.8768095754086971, + "num_tokens": 9467791.0, + "step": 170 + }, + { + "entropy": 0.3774459037929773, + "epoch": 0.2733485193621868, + "grad_norm": 0.7109375, + "learning_rate": 8.716843267539868e-06, + "loss": 0.3767258644104004, + "mean_token_accuracy": 0.8779186218976974, + "num_tokens": 10013526.0, + "step": 180 + }, + { + "entropy": 0.3706828704103827, + "epoch": 0.2885345482156416, + "grad_norm": 0.6875, + "learning_rate": 8.544394337454409e-06, + "loss": 0.373125958442688, + "mean_token_accuracy": 0.8792334951460361, + "num_tokens": 10567209.0, + "step": 190 + }, + { + "entropy": 0.3781122103333473, + "epoch": 0.30372057706909644, + "grad_norm": 0.7109375, + "learning_rate": 8.36302053607924e-06, + "loss": 0.3779691457748413, + "mean_token_accuracy": 0.877835976332426, + "num_tokens": 11121802.0, + "step": 200 + }, + { + "entropy": 0.37852676026523113, + "epoch": 0.31890660592255127, + "grad_norm": 0.703125, + "learning_rate": 8.17317856706482e-06, + "loss": 0.37905910015106203, + "mean_token_accuracy": 0.877592646330595, + "num_tokens": 11677885.0, + "step": 210 + }, + { + "entropy": 0.376834512129426, + "epoch": 0.3340926347760061, + "grad_norm": 0.66015625, + "learning_rate": 7.975346457114034e-06, + "loss": 0.3753563404083252, + "mean_token_accuracy": 0.8776590585708618, + "num_tokens": 12235216.0, + "step": 220 + }, + { + "entropy": 0.3737819105386734, + "epoch": 0.3492786636294609, + "grad_norm": 0.671875, + "learning_rate": 7.770022352299294e-06, + "loss": 0.37358593940734863, + "mean_token_accuracy": 0.878921328485012, + "num_tokens": 12787759.0, + "step": 230 + }, + { + "entropy": 0.3769740372896194, + "epoch": 0.36446469248291574, + "grad_norm": 0.73046875, + "learning_rate": 7.557723263718596e-06, + "loss": 0.37995898723602295, + "mean_token_accuracy": 0.8769361607730388, + "num_tokens": 13346471.0, + "step": 240 + }, + { + "entropy": 0.3878506176173687, + "epoch": 0.37965072133637057, + "grad_norm": 0.70703125, + "learning_rate": 7.338983765648985e-06, + "loss": 0.38782215118408203, + "mean_token_accuracy": 0.8749015353620052, + "num_tokens": 13895194.0, + "step": 250 + }, + { + "entropy": 0.37555828876793385, + "epoch": 0.39483675018982534, + "grad_norm": 0.6875, + "learning_rate": 7.114354649475499e-06, + "loss": 0.3771331787109375, + "mean_token_accuracy": 0.878202386945486, + "num_tokens": 14453542.0, + "step": 260 + }, + { + "entropy": 0.37179951313883064, + "epoch": 0.41002277904328016, + "grad_norm": 0.6484375, + "learning_rate": 6.884401536785045e-06, + "loss": 0.37206058502197265, + "mean_token_accuracy": 0.8789021499454975, + "num_tokens": 15016280.0, + "step": 270 + }, + { + "entropy": 0.3706459369510412, + "epoch": 0.425208807896735, + "grad_norm": 0.7109375, + "learning_rate": 6.6497034551174585e-06, + "loss": 0.37101426124572756, + "mean_token_accuracy": 0.8798882246017456, + "num_tokens": 15561057.0, + "step": 280 + }, + { + "entropy": 0.37728526555001735, + "epoch": 0.4403948367501898, + "grad_norm": 0.65625, + "learning_rate": 6.41085137996006e-06, + "loss": 0.37785754203796384, + "mean_token_accuracy": 0.8779311388731003, + "num_tokens": 16127699.0, + "step": 290 + }, + { + "entropy": 0.3816283464431763, + "epoch": 0.45558086560364464, + "grad_norm": 0.81640625, + "learning_rate": 6.168446746656973e-06, + "loss": 0.3794879674911499, + "mean_token_accuracy": 0.8773063771426678, + "num_tokens": 16686457.0, + "step": 300 + }, + { + "entropy": 0.37517447732388975, + "epoch": 0.47076689445709946, + "grad_norm": 0.70703125, + "learning_rate": 5.923099935980278e-06, + "loss": 0.3782352924346924, + "mean_token_accuracy": 0.8787827685475349, + "num_tokens": 17254272.0, + "step": 310 + }, + { + "entropy": 0.374018133059144, + "epoch": 0.4859529233105543, + "grad_norm": 0.71484375, + "learning_rate": 5.675428737176367e-06, + "loss": 0.37341156005859377, + "mean_token_accuracy": 0.8788688823580741, + "num_tokens": 17809900.0, + "step": 320 + }, + { + "entropy": 0.3753270395100117, + "epoch": 0.5011389521640092, + "grad_norm": 0.68359375, + "learning_rate": 5.426056792357552e-06, + "loss": 0.3752497673034668, + "mean_token_accuracy": 0.8784179173409938, + "num_tokens": 18379566.0, + "step": 330 + }, + { + "entropy": 0.3742110010236502, + "epoch": 0.5163249810174639, + "grad_norm": 0.6875, + "learning_rate": 5.175612026156045e-06, + "loss": 0.3746063232421875, + "mean_token_accuracy": 0.8782069273293018, + "num_tokens": 18943281.0, + "step": 340 + }, + { + "entropy": 0.37444472052156924, + "epoch": 0.5315110098709187, + "grad_norm": 0.71484375, + "learning_rate": 4.924725064594448e-06, + "loss": 0.3729024171829224, + "mean_token_accuracy": 0.8787923693656922, + "num_tokens": 19488865.0, + "step": 350 + }, + { + "entropy": 0.3750518877059221, + "epoch": 0.5466970387243736, + "grad_norm": 0.78515625, + "learning_rate": 4.674027647154037e-06, + "loss": 0.3758077621459961, + "mean_token_accuracy": 0.8765743866562843, + "num_tokens": 20048281.0, + "step": 360 + }, + { + "entropy": 0.3787516813725233, + "epoch": 0.5618830675778284, + "grad_norm": 0.74609375, + "learning_rate": 4.424151036039381e-06, + "loss": 0.3790909767150879, + "mean_token_accuracy": 0.8769759923219681, + "num_tokens": 20597434.0, + "step": 370 + }, + { + "entropy": 0.3786877432838082, + "epoch": 0.5770690964312832, + "grad_norm": 0.68359375, + "learning_rate": 4.175724426644724e-06, + "loss": 0.3812232971191406, + "mean_token_accuracy": 0.8777030549943448, + "num_tokens": 21161267.0, + "step": 380 + }, + { + "entropy": 0.37311795353889465, + "epoch": 0.592255125284738, + "grad_norm": 0.6796875, + "learning_rate": 3.929373363224654e-06, + "loss": 0.3731100559234619, + "mean_token_accuracy": 0.8793233536183834, + "num_tokens": 21709421.0, + "step": 390 + }, + { + "entropy": 0.3734915753826499, + "epoch": 0.6074411541381929, + "grad_norm": 0.6875, + "learning_rate": 3.685718163758427e-06, + "loss": 0.37124335765838623, + "mean_token_accuracy": 0.8786324210464954, + "num_tokens": 22250023.0, + "step": 400 + }, + { + "entropy": 0.3722789943218231, + "epoch": 0.6226271829916477, + "grad_norm": 0.67578125, + "learning_rate": 3.445372357974194e-06, + "loss": 0.37429609298706057, + "mean_token_accuracy": 0.8784996062517166, + "num_tokens": 22802881.0, + "step": 410 + }, + { + "entropy": 0.3826644644141197, + "epoch": 0.6378132118451025, + "grad_norm": 0.65625, + "learning_rate": 3.2089411424661864e-06, + "loss": 0.3828511953353882, + "mean_token_accuracy": 0.875868634134531, + "num_tokens": 23368508.0, + "step": 420 + }, + { + "entropy": 0.36304581388831136, + "epoch": 0.6529992406985573, + "grad_norm": 0.703125, + "learning_rate": 2.977019856794955e-06, + "loss": 0.362534499168396, + "mean_token_accuracy": 0.8821237675845623, + "num_tokens": 23923709.0, + "step": 430 + }, + { + "entropy": 0.38759873658418653, + "epoch": 0.6681852695520122, + "grad_norm": 0.67578125, + "learning_rate": 2.7501924844078538e-06, + "loss": 0.38718571662902834, + "mean_token_accuracy": 0.8746685221791267, + "num_tokens": 24477925.0, + "step": 440 + }, + { + "entropy": 0.3708066754043102, + "epoch": 0.683371298405467, + "grad_norm": 0.6875, + "learning_rate": 2.5290301821544826e-06, + "loss": 0.36970815658569334, + "mean_token_accuracy": 0.8802599847316742, + "num_tokens": 25027245.0, + "step": 450 + }, + { + "entropy": 0.36688214987516404, + "epoch": 0.6985573272589218, + "grad_norm": 0.68359375, + "learning_rate": 2.3140898420998425e-06, + "loss": 0.3657586097717285, + "mean_token_accuracy": 0.8809232294559479, + "num_tokens": 25582534.0, + "step": 460 + }, + { + "entropy": 0.36983687337487936, + "epoch": 0.7137433561123766, + "grad_norm": 0.7109375, + "learning_rate": 2.105912689256533e-06, + "loss": 0.37239837646484375, + "mean_token_accuracy": 0.8794391065835953, + "num_tokens": 26134875.0, + "step": 470 + }, + { + "entropy": 0.3732773784548044, + "epoch": 0.7289293849658315, + "grad_norm": 0.63671875, + "learning_rate": 1.905022918766995e-06, + "loss": 0.37306258678436277, + "mean_token_accuracy": 0.8793606124818325, + "num_tokens": 26681084.0, + "step": 480 + }, + { + "entropy": 0.3710829207673669, + "epoch": 0.7441154138192863, + "grad_norm": 0.69921875, + "learning_rate": 1.7119263759673677e-06, + "loss": 0.3711911678314209, + "mean_token_accuracy": 0.8803343921899796, + "num_tokens": 27234332.0, + "step": 490 + }, + { + "entropy": 0.3832434043288231, + "epoch": 0.7593014426727411, + "grad_norm": 0.9921875, + "learning_rate": 1.5271092826566108e-06, + "loss": 0.3841698169708252, + "mean_token_accuracy": 0.8759766638278961, + "num_tokens": 27794602.0, + "step": 500 + }, + { + "entropy": 0.38211295008659363, + "epoch": 0.7744874715261959, + "grad_norm": 0.8046875, + "learning_rate": 1.3510370127781635e-06, + "loss": 0.3804590940475464, + "mean_token_accuracy": 0.8769169762730599, + "num_tokens": 28354831.0, + "step": 510 + }, + { + "entropy": 0.3752284612506628, + "epoch": 0.7896735003796507, + "grad_norm": 0.73046875, + "learning_rate": 1.1841529205970281e-06, + "loss": 0.37546916007995607, + "mean_token_accuracy": 0.8786770381033421, + "num_tokens": 28922852.0, + "step": 520 + }, + { + "entropy": 0.36873019095510245, + "epoch": 0.8048595292331056, + "grad_norm": 0.6640625, + "learning_rate": 1.026877224322923e-06, + "loss": 0.36797375679016114, + "mean_token_accuracy": 0.880731363594532, + "num_tokens": 29493442.0, + "step": 530 + }, + { + "entropy": 0.3788988694548607, + "epoch": 0.8200455580865603, + "grad_norm": 0.69921875, + "learning_rate": 8.7960594799059e-07, + "loss": 0.37884984016418455, + "mean_token_accuracy": 0.8770281121134758, + "num_tokens": 30034443.0, + "step": 540 + }, + { + "entropy": 0.3814096964895725, + "epoch": 0.8352315869400152, + "grad_norm": 0.73046875, + "learning_rate": 7.427099242616348e-07, + "loss": 0.3821078300476074, + "mean_token_accuracy": 0.8763406798243523, + "num_tokens": 30570567.0, + "step": 550 + }, + { + "entropy": 0.3758743409067392, + "epoch": 0.85041761579347, + "grad_norm": 0.6953125, + "learning_rate": 6.165338606588517e-07, + "loss": 0.3744307279586792, + "mean_token_accuracy": 0.8794760994613171, + "num_tokens": 31129457.0, + "step": 560 + }, + { + "entropy": 0.37431446108967065, + "epoch": 0.8656036446469249, + "grad_norm": 0.6640625, + "learning_rate": 5.0139547158427e-07, + "loss": 0.37335963249206544, + "mean_token_accuracy": 0.8793582506477833, + "num_tokens": 31689902.0, + "step": 570 + }, + { + "entropy": 0.38372623883187773, + "epoch": 0.8807896735003796, + "grad_norm": 0.7578125, + "learning_rate": 3.9758467830656623e-07, + "loss": 0.38321547508239745, + "mean_token_accuracy": 0.8755873307585716, + "num_tokens": 32253359.0, + "step": 580 + }, + { + "entropy": 0.36962624490261076, + "epoch": 0.8959757023538345, + "grad_norm": 0.72265625, + "learning_rate": 3.0536287893223603e-07, + "loss": 0.37100839614868164, + "mean_token_accuracy": 0.8799250744283199, + "num_tokens": 32813428.0, + "step": 590 + }, + { + "entropy": 0.3853254303336143, + "epoch": 0.9111617312072893, + "grad_norm": 0.6640625, + "learning_rate": 2.2496229019879635e-07, + "loss": 0.3848439693450928, + "mean_token_accuracy": 0.8762004837393761, + "num_tokens": 33382024.0, + "step": 600 + }, + { + "entropy": 0.3742272950708866, + "epoch": 0.9263477600607442, + "grad_norm": 0.671875, + "learning_rate": 1.5658536274738623e-07, + "loss": 0.3725078582763672, + "mean_token_accuracy": 0.8787065915763378, + "num_tokens": 33939521.0, + "step": 610 + }, + { + "entropy": 0.3774934906512499, + "epoch": 0.9415337889141989, + "grad_norm": 0.69140625, + "learning_rate": 1.004042713471165e-07, + "loss": 0.37858588695526124, + "mean_token_accuracy": 0.877676124125719, + "num_tokens": 34482539.0, + "step": 620 + }, + { + "entropy": 0.37360552567988636, + "epoch": 0.9567198177676538, + "grad_norm": 0.734375, + "learning_rate": 5.6560481354807625e-08, + "loss": 0.37269864082336424, + "mean_token_accuracy": 0.8786865592002868, + "num_tokens": 35045028.0, + "step": 630 + }, + { + "entropy": 0.3767994062975049, + "epoch": 0.9719058466211086, + "grad_norm": 0.890625, + "learning_rate": 2.516439250177749e-08, + "loss": 0.3758098125457764, + "mean_token_accuracy": 0.8778494797647, + "num_tokens": 35600399.0, + "step": 640 + }, + { + "entropy": 0.3786265593022108, + "epoch": 0.9870918754745635, + "grad_norm": 0.6875, + "learning_rate": 6.295060904623618e-09, + "loss": 0.378217077255249, + "mean_token_accuracy": 0.8778206452727317, + "num_tokens": 36162293.0, + "step": 650 + } + ], + "logging_steps": 10, + "max_steps": 659, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.858614662906511e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-659/training_args.bin b/checkpoint-659/training_args.bin new file mode 100644 index 0000000..5dbf4c3 --- /dev/null +++ b/checkpoint-659/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:514f32c7b13687591d405f4e860f5e4d9145eaaff00bbbfd04aded17ecc9774d +size 5777 diff --git a/config.json b/config.json new file mode 100644 index 0000000..18268ea --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.8.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..dcd33a6 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "do_sample": false, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 2048, + "pad_token_id": 151643, + "transformers_version": "5.8.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..e654f28 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e18e9f555d63e04b2a5d67a10939ba633cf584772c20342c840bc3158f275c +size 8044982080 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..b02d0a8 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,30 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": { + "im_start": "<|im_start|>", + "im_end": "<|im_end|>", + "object_ref_start": "<|object_ref_start|>", + "object_ref_end": "<|object_ref_end|>", + "box_start": "<|box_start|>", + "box_end": "<|box_end|>", + "quad_start": "<|quad_start|>", + "quad_end": "<|quad_end|>", + "vision_start": "<|vision_start|>", + "vision_end": "<|vision_end|>", + "vision_pad": "<|vision_pad|>", + "image_pad": "<|image_pad|>", + "video_pad": "<|video_pad|>" + }, + "is_local": false, + "local_files_only": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..5dbf4c3 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:514f32c7b13687591d405f4e860f5e4d9145eaaff00bbbfd04aded17ecc9774d +size 5777