commit 8eae555d0b5f69e74071aa098213b2bac623975c Author: ModelHub XC Date: Sun May 24 01:55:15 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: BEE-spoke-data/Mixtral-GQA-400m-v2 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..71781c1 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +--- +license: apache-2.0 +language: +- en +--- + + + +# BEE-spoke-data/Mixtral-GQA-400m-v2 + + + + +## testing code + +```python +# !pip install -U -q transformers datasets accelerate sentencepiece +import pprint as pp +from transformers import pipeline + +pipe = pipeline( + "text-generation", + model="BEE-spoke-data/Mixtral-GQA-400m-v2", + device_map="auto", +) +pipe.model.config.pad_token_id = pipe.model.config.eos_token_id + +prompt = "My favorite movie is Godfather because" + +res = pipe( + prompt, + max_new_tokens=256, + top_k=4, + penalty_alpha=0.6, + use_cache=True, + no_repeat_ngram_size=4, + repetition_penalty=1.1, + renormalize_logits=True, +) +pp.pprint(res[0]) +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..a33590d --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "/workspace/axolotl/mixtral-smol-400-v2", + "architectures": [ + "MixtralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 4096, + "model_type": "mixtral", + "num_attention_heads": 32, + "num_experts_per_tok": 2, + "num_hidden_layers": 6, + "num_key_value_heads": 8, + "num_local_experts": 8, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "router_aux_loss_coef": 0.02, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.37.0.dev0", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..59cf3fd --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.37.0.dev0", + "use_cache": false +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..07af250 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed8d25c13b5844e479e79e2bb6c1575e37cd14c850acb4fef3f0e64f666333c +size 4274322736 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..72ecfee --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..8b443ef --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055 +size 493443 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3930c93 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [], + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "trust_remote_code": true, + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..b5a7ba3 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1853 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.36951013513513514, + "eval_steps": 406, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "eval_loss": 10.780741691589355, + "eval_runtime": 947.394, + "eval_samples_per_second": 79.833, + "eval_steps_per_second": 4.435, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.25e-05, + "loss": 10.7434, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 4.5e-05, + "loss": 9.9589, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 6.75e-05, + "loss": 8.6595, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 9e-05, + "loss": 7.9676, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001125, + "loss": 7.4358, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 0.000135, + "loss": 7.1452, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015749999999999998, + "loss": 6.9396, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 0.00018, + "loss": 6.8089, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002025, + "loss": 6.6897, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 0.000225, + "loss": 6.4553, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 0.00022499913644577618, + "loss": 6.3166, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002249965457963621, + "loss": 6.1718, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022499222809152964, + "loss": 6.0497, + "step": 65 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022498618339756446, + "loss": 5.9306, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022497841180726518, + "loss": 5.825, + "step": 75 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022496891343994188, + "loss": 5.7394, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022495768844141414, + "loss": 5.6247, + "step": 85 + }, + { + "epoch": 0.02, + "learning_rate": 0.000224944736984009, + "loss": 5.5139, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022493005926655827, + "loss": 5.3914, + "step": 95 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002249136555143953, + "loss": 5.369, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022489552597935173, + "loss": 5.3003, + "step": 105 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022487567093975358, + "loss": 5.2256, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022485409070041688, + "loss": 5.141, + "step": 115 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022483078559264308, + "loss": 5.0825, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022480575597421393, + "loss": 5.0306, + "step": 125 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002247790022293861, + "loss": 4.9651, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 0.000224750524768885, + "loss": 4.9608, + "step": 135 + }, + { + "epoch": 0.03, + "learning_rate": 0.00022472032402989878, + "loss": 4.9164, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.00022468840047607143, + "loss": 4.8746, + "step": 145 + }, + { + "epoch": 0.04, + "learning_rate": 0.00022465475459749576, + "loss": 4.7804, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 0.00022461938691070582, + "loss": 4.7936, + "step": 155 + }, + { + "epoch": 0.04, + "learning_rate": 0.000224582297958669, + "loss": 4.7272, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00022454348831077767, + "loss": 4.7058, + "step": 165 + }, + { + "epoch": 0.04, + "learning_rate": 0.00022450295856284047, + "loss": 4.6486, + "step": 170 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002244607093370731, + "loss": 4.6536, + "step": 175 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002244167412820889, + "loss": 4.5721, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022437105507288872, + "loss": 4.5511, + "step": 185 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022432365141085068, + "loss": 4.4932, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022427453102371933, + "loss": 4.5251, + "step": 195 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002242236946655946, + "loss": 4.4763, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022417114311692, + "loss": 4.5116, + "step": 205 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022411687718447093, + "loss": 4.4102, + "step": 210 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022406089770134205, + "loss": 4.4343, + "step": 215 + }, + { + "epoch": 0.05, + "learning_rate": 0.00022400320552693452, + "loss": 4.4403, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.000223943801546943, + "loss": 4.4068, + "step": 225 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002238826866733418, + "loss": 4.3988, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022381986184437112, + "loss": 4.3703, + "step": 235 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022375532802452238, + "loss": 4.303, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022368908620452367, + "loss": 4.2979, + "step": 245 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022362113740132436, + "loss": 4.2974, + "step": 250 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022355148265807966, + "loss": 4.2652, + "step": 255 + }, + { + "epoch": 0.06, + "learning_rate": 0.00022348012304413426, + "loss": 4.203, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022340705965500642, + "loss": 4.2045, + "step": 265 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022333229361237082, + "loss": 4.2342, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022325582606404126, + "loss": 4.1869, + "step": 275 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022317765818395332, + "loss": 4.0939, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022309779117214617, + "loss": 4.1401, + "step": 285 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022301622625474417, + "loss": 4.0744, + "step": 290 + }, + { + "epoch": 0.07, + "learning_rate": 0.00022293296468393808, + "loss": 4.0818, + "step": 295 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002228480077379657, + "loss": 4.029, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022276135672109258, + "loss": 4.0574, + "step": 305 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022267301296359155, + "loss": 4.0003, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022258297782172258, + "loss": 4.0143, + "step": 315 + }, + { + "epoch": 0.08, + "learning_rate": 0.000222491252677712, + "loss": 3.9962, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002223978389397311, + "loss": 3.9994, + "step": 325 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022230273804187456, + "loss": 3.9424, + "step": 330 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022220595144413854, + "loss": 3.9215, + "step": 335 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022210748063239815, + "loss": 3.9483, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 0.00022200732711838466, + "loss": 3.9037, + "step": 345 + }, + { + "epoch": 0.09, + "learning_rate": 0.00022190549243966234, + "loss": 3.8959, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002218019781596049, + "loss": 3.8066, + "step": 355 + }, + { + "epoch": 0.09, + "learning_rate": 0.00022169678586737127, + "loss": 3.8306, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00022158991717788137, + "loss": 3.7961, + "step": 365 + }, + { + "epoch": 0.09, + "learning_rate": 0.00022148137373179146, + "loss": 3.7739, + "step": 370 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002213711571954686, + "loss": 3.7727, + "step": 375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00022125926926096538, + "loss": 3.7895, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002211457116459937, + "loss": 3.7839, + "step": 385 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022103048609389868, + "loss": 3.7261, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022091359437363157, + "loss": 3.7129, + "step": 395 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022079503827972293, + "loss": 3.6765, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002206748196322547, + "loss": 3.6463, + "step": 405 + }, + { + "epoch": 0.1, + "eval_loss": 3.6351399421691895, + "eval_runtime": 955.1583, + "eval_samples_per_second": 79.184, + "eval_steps_per_second": 4.399, + "step": 406 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022055294027683266, + "loss": 3.6138, + "step": 410 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002204294020845578, + "loss": 3.6069, + "step": 415 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022030420695199774, + "loss": 3.5781, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 0.00022017735680115755, + "loss": 3.5925, + "step": 425 + }, + { + "epoch": 0.11, + "learning_rate": 0.00022004885357945026, + "loss": 3.6038, + "step": 430 + }, + { + "epoch": 0.11, + "learning_rate": 0.000219918699259667, + "loss": 3.5368, + "step": 435 + }, + { + "epoch": 0.11, + "learning_rate": 0.00021978689583994666, + "loss": 3.5982, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00021965344534374522, + "loss": 3.5501, + "step": 445 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002195183498198047, + "loss": 3.5315, + "step": 450 + }, + { + "epoch": 0.11, + "learning_rate": 0.00021938161134212177, + "loss": 3.5229, + "step": 455 + }, + { + "epoch": 0.11, + "learning_rate": 0.00021924323200991577, + "loss": 3.5106, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 0.00021910321394759662, + "loss": 3.4851, + "step": 465 + }, + { + "epoch": 0.12, + "learning_rate": 0.00021896155930473216, + "loss": 3.4405, + "step": 470 + }, + { + "epoch": 0.12, + "learning_rate": 0.00021881827025601504, + "loss": 3.5036, + "step": 475 + }, + { + "epoch": 0.12, + "learning_rate": 0.00021867334900122954, + "loss": 3.4158, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002185267977652176, + "loss": 3.465, + "step": 485 + }, + { + "epoch": 0.12, + "learning_rate": 0.00021837861879784484, + "loss": 3.3843, + "step": 490 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002182288143739659, + "loss": 3.3678, + "step": 495 + }, + { + "epoch": 0.12, + "learning_rate": 0.00021807738679338953, + "loss": 3.4079, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002179243383808433, + "loss": 3.3826, + "step": 505 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021776967148593793, + "loss": 3.4016, + "step": 510 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021761338848313123, + "loss": 3.3715, + "step": 515 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002174554917716916, + "loss": 3.4046, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021729598377566122, + "loss": 3.3304, + "step": 525 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021713486694381875, + "loss": 3.3419, + "step": 530 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021697214374964195, + "loss": 3.3681, + "step": 535 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002168078166912695, + "loss": 3.3012, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 0.00021664188829146277, + "loss": 3.3551, + "step": 545 + }, + { + "epoch": 0.14, + "learning_rate": 0.000216474361097567, + "loss": 3.2824, + "step": 550 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021630523768147218, + "loss": 3.3024, + "step": 555 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021613452063957379, + "loss": 3.2661, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021596221259273266, + "loss": 3.2882, + "step": 565 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002157883161862348, + "loss": 3.3181, + "step": 570 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021561283408975097, + "loss": 3.2574, + "step": 575 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021543576899729543, + "loss": 3.2339, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 0.00021525712362718483, + "loss": 3.2554, + "step": 585 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021507690072199625, + "loss": 3.1892, + "step": 590 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021489510304852536, + "loss": 3.2946, + "step": 595 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021471173339774363, + "loss": 3.2303, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021452679458475567, + "loss": 3.2891, + "step": 605 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021434028944875607, + "loss": 3.224, + "step": 610 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021415222085298573, + "loss": 3.2337, + "step": 615 + }, + { + "epoch": 0.15, + "learning_rate": 0.00021396259168468773, + "loss": 3.2158, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002137714048550634, + "loss": 3.2126, + "step": 625 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002135786632992273, + "loss": 3.1934, + "step": 630 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021338436997616223, + "loss": 3.2051, + "step": 635 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021318852786867388, + "loss": 3.2667, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021299113998334503, + "loss": 3.1956, + "step": 645 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021279220935048926, + "loss": 3.1771, + "step": 650 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002125917390241046, + "loss": 3.1467, + "step": 655 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021238973208182659, + "loss": 3.1788, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 0.00021218619162488095, + "loss": 3.1967, + "step": 665 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021198112077803607, + "loss": 3.149, + "step": 670 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021177452268955496, + "loss": 3.154, + "step": 675 + }, + { + "epoch": 0.17, + "learning_rate": 0.000211566400531147, + "loss": 3.1652, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021135675749791924, + "loss": 3.1433, + "step": 685 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021114559680832722, + "loss": 3.1893, + "step": 690 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002109329217041257, + "loss": 3.1788, + "step": 695 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021071873545031885, + "loss": 3.1549, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021050304133511018, + "loss": 3.1294, + "step": 705 + }, + { + "epoch": 0.17, + "learning_rate": 0.00021028584266985186, + "loss": 3.1109, + "step": 710 + }, + { + "epoch": 0.18, + "learning_rate": 0.00021006714278899415, + "loss": 3.1713, + "step": 715 + }, + { + "epoch": 0.18, + "learning_rate": 0.00020984694505003402, + "loss": 3.1304, + "step": 720 + }, + { + "epoch": 0.18, + "learning_rate": 0.00020962525283346376, + "loss": 3.1285, + "step": 725 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002094020695427188, + "loss": 3.0582, + "step": 730 + }, + { + "epoch": 0.18, + "learning_rate": 0.00020917739860412592, + "loss": 3.1063, + "step": 735 + }, + { + "epoch": 0.18, + "learning_rate": 0.00020895124346685017, + "loss": 3.1309, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 0.00020872360760284219, + "loss": 3.1125, + "step": 745 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002084944945067849, + "loss": 3.1108, + "step": 750 + }, + { + "epoch": 0.19, + "learning_rate": 0.00020826390769603968, + "loss": 3.0765, + "step": 755 + }, + { + "epoch": 0.19, + "learning_rate": 0.00020803185071059267, + "loss": 3.0634, + "step": 760 + }, + { + "epoch": 0.19, + "learning_rate": 0.000207798327113, + "loss": 3.0993, + "step": 765 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002075633404883336, + "loss": 3.1127, + "step": 770 + }, + { + "epoch": 0.19, + "learning_rate": 0.00020732689444412573, + "loss": 3.0502, + "step": 775 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002070889926103138, + "loss": 3.0436, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002068496386391846, + "loss": 3.0305, + "step": 785 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002066088362053184, + "loss": 3.0996, + "step": 790 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020636658900553213, + "loss": 3.0584, + "step": 795 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020612290075882296, + "loss": 3.0508, + "step": 800 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020587777520631126, + "loss": 3.0578, + "step": 805 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020563121611118286, + "loss": 3.0308, + "step": 810 + }, + { + "epoch": 0.2, + "eval_loss": 3.0524611473083496, + "eval_runtime": 955.714, + "eval_samples_per_second": 79.138, + "eval_steps_per_second": 4.397, + "step": 812 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020538322725863146, + "loss": 3.122, + "step": 815 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020513381245580064, + "loss": 2.9886, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020488297553172515, + "loss": 3.0606, + "step": 825 + }, + { + "epoch": 0.2, + "learning_rate": 0.00020463072033727225, + "loss": 2.993, + "step": 830 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020437705074508264, + "loss": 2.9999, + "step": 835 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020412197064951097, + "loss": 3.0143, + "step": 840 + }, + { + "epoch": 0.21, + "learning_rate": 0.000203865483966566, + "loss": 2.9886, + "step": 845 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020360759463385053, + "loss": 3.0219, + "step": 850 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020334830661050102, + "loss": 2.9888, + "step": 855 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020308762387712662, + "loss": 3.0271, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020282555043574823, + "loss": 3.0063, + "step": 865 + }, + { + "epoch": 0.21, + "learning_rate": 0.00020256209030973708, + "loss": 3.0198, + "step": 870 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020229724754375266, + "loss": 3.0135, + "step": 875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020203102620368113, + "loss": 3.0008, + "step": 880 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020176343037657242, + "loss": 3.0168, + "step": 885 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020149446417057782, + "loss": 3.042, + "step": 890 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020122413171488667, + "loss": 2.9954, + "step": 895 + }, + { + "epoch": 0.22, + "learning_rate": 0.00020095243715966316, + "loss": 3.0191, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002006793846759825, + "loss": 3.0343, + "step": 905 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002004049784557669, + "loss": 2.9843, + "step": 910 + }, + { + "epoch": 0.23, + "learning_rate": 0.00020012922271172128, + "loss": 3.0263, + "step": 915 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019985212167726853, + "loss": 2.975, + "step": 920 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001995736796064845, + "loss": 2.9858, + "step": 925 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001992939007740328, + "loss": 2.9526, + "step": 930 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001990127894750991, + "loss": 2.971, + "step": 935 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019873035002532512, + "loss": 2.9635, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019844658676074255, + "loss": 2.9807, + "step": 945 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001981615040377063, + "loss": 2.9822, + "step": 950 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019787510623282776, + "loss": 2.9552, + "step": 955 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019758739774290753, + "loss": 2.9877, + "step": 960 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019729838298486793, + "loss": 2.974, + "step": 965 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019700806639568524, + "loss": 2.9613, + "step": 970 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019671645243232155, + "loss": 2.9949, + "step": 975 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019642354557165633, + "loss": 2.9876, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019612935031041768, + "loss": 2.9644, + "step": 985 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019583387116511335, + "loss": 2.9204, + "step": 990 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019553711267196136, + "loss": 2.9849, + "step": 995 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019523907938682038, + "loss": 2.9622, + "step": 1000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019493977588511978, + "loss": 2.9457, + "step": 1005 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001946392067617894, + "loss": 2.9558, + "step": 1010 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019433737663118898, + "loss": 2.9107, + "step": 1015 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001940342901270374, + "loss": 2.963, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019372995190234136, + "loss": 2.8945, + "step": 1025 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019342436662932416, + "loss": 2.9756, + "step": 1030 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019311753899935389, + "loss": 2.9392, + "step": 1035 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019280947372287132, + "loss": 2.9293, + "step": 1040 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019250017552931774, + "loss": 2.947, + "step": 1045 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019218964916706223, + "loss": 2.9317, + "step": 1050 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019187789940332882, + "loss": 2.8816, + "step": 1055 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001915649310241233, + "loss": 2.9644, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001912507488341597, + "loss": 2.9219, + "step": 1065 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019093535765678663, + "loss": 2.8983, + "step": 1070 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019061876233391313, + "loss": 2.8816, + "step": 1075 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019030096772593432, + "loss": 2.8844, + "step": 1080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018998197871165692, + "loss": 2.9207, + "step": 1085 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018966180018822423, + "loss": 2.9553, + "step": 1090 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018934043707104098, + "loss": 2.893, + "step": 1095 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001890178942936979, + "loss": 2.9049, + "step": 1100 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018869417680789587, + "loss": 2.8872, + "step": 1105 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018836928958337009, + "loss": 2.9219, + "step": 1110 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018804323760781362, + "loss": 2.9005, + "step": 1115 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018771602588680083, + "loss": 2.8814, + "step": 1120 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018738765944371067, + "loss": 2.8915, + "step": 1125 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018705814331964945, + "loss": 2.8777, + "step": 1130 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001867274825733734, + "loss": 2.8821, + "step": 1135 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001863956822812112, + "loss": 2.8803, + "step": 1140 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018606274753698576, + "loss": 2.8787, + "step": 1145 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018572868345193632, + "loss": 2.9365, + "step": 1150 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001853934951546398, + "loss": 2.8642, + "step": 1155 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018505718779093206, + "loss": 2.8333, + "step": 1160 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001847197665238291, + "loss": 2.9019, + "step": 1165 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018438123653344746, + "loss": 2.8726, + "step": 1170 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018404160301692504, + "loss": 2.8655, + "step": 1175 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018370087118834102, + "loss": 2.8639, + "step": 1180 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018335904627863605, + "loss": 2.8742, + "step": 1185 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018301613353553182, + "loss": 2.9013, + "step": 1190 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001826721382234505, + "loss": 2.9131, + "step": 1195 + }, + { + "epoch": 0.3, + "learning_rate": 0.000182327065623434, + "loss": 2.9068, + "step": 1200 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001819809210330627, + "loss": 2.9222, + "step": 1205 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001816337097663744, + "loss": 2.8718, + "step": 1210 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018128543715378252, + "loss": 2.8504, + "step": 1215 + }, + { + "epoch": 0.3, + "eval_loss": 2.8658618927001953, + "eval_runtime": 957.687, + "eval_samples_per_second": 78.975, + "eval_steps_per_second": 4.388, + "step": 1218 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018093610854199438, + "loss": 2.8823, + "step": 1220 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018058572929392902, + "loss": 2.8528, + "step": 1225 + }, + { + "epoch": 0.3, + "learning_rate": 0.000180234304788635, + "loss": 2.9415, + "step": 1230 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001798818404212077, + "loss": 2.8599, + "step": 1235 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017952834160270655, + "loss": 2.8218, + "step": 1240 + }, + { + "epoch": 0.31, + "learning_rate": 0.000179173813760072, + "loss": 2.8258, + "step": 1245 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017881826233604204, + "loss": 2.8397, + "step": 1250 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017846169278906888, + "loss": 2.8817, + "step": 1255 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017810411059323498, + "loss": 2.8424, + "step": 1260 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017774552123816904, + "loss": 2.845, + "step": 1265 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017738593022896177, + "loss": 2.8272, + "step": 1270 + }, + { + "epoch": 0.31, + "learning_rate": 0.00017702534308608133, + "loss": 2.8452, + "step": 1275 + }, + { + "epoch": 0.32, + "learning_rate": 0.00017666376534528866, + "loss": 2.8805, + "step": 1280 + }, + { + "epoch": 0.32, + "learning_rate": 0.00017630120255755235, + "loss": 2.8824, + "step": 1285 + }, + { + "epoch": 0.32, + "learning_rate": 0.00017593766028896357, + "loss": 2.8669, + "step": 1290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001755731441206505, + "loss": 2.8883, + "step": 1295 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001752076596486927, + "loss": 2.813, + "step": 1300 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001748412124840353, + "loss": 2.8275, + "step": 1305 + }, + { + "epoch": 0.32, + "learning_rate": 0.00017447380825240264, + "loss": 2.8017, + "step": 1310 + }, + { + "epoch": 0.32, + "learning_rate": 0.00017410545259421208, + "loss": 2.8368, + "step": 1315 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017373615116448736, + "loss": 2.8448, + "step": 1320 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017336590963277173, + "loss": 2.8353, + "step": 1325 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017299473368304102, + "loss": 2.8326, + "step": 1330 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017262262901361627, + "loss": 2.7981, + "step": 1335 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017224960133707627, + "loss": 2.828, + "step": 1340 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017187565638017, + "loss": 2.8209, + "step": 1345 + }, + { + "epoch": 0.33, + "learning_rate": 0.00017150079988372842, + "loss": 2.8166, + "step": 1350 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001711250376025767, + "loss": 2.7783, + "step": 1355 + }, + { + "epoch": 0.34, + "learning_rate": 0.00017074837530544557, + "loss": 2.7564, + "step": 1360 + }, + { + "epoch": 0.34, + "learning_rate": 0.00017037081877488284, + "loss": 2.7957, + "step": 1365 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001699923738071648, + "loss": 2.8139, + "step": 1370 + }, + { + "epoch": 0.34, + "learning_rate": 0.00016961304621220696, + "loss": 2.7938, + "step": 1375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00016923284181347506, + "loss": 2.8097, + "step": 1380 + }, + { + "epoch": 0.34, + "learning_rate": 0.00016885176644789557, + "loss": 2.8043, + "step": 1385 + }, + { + "epoch": 0.34, + "learning_rate": 0.00016846982596576614, + "loss": 2.7577, + "step": 1390 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001680870262306657, + "loss": 2.7921, + "step": 1395 + }, + { + "epoch": 0.34, + "learning_rate": 0.00016770337311936456, + "loss": 2.7836, + "step": 1400 + }, + { + "epoch": 0.35, + "learning_rate": 0.00016731887252173408, + "loss": 2.7963, + "step": 1405 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001669335303406563, + "loss": 2.8547, + "step": 1410 + }, + { + "epoch": 0.35, + "learning_rate": 0.00016654735249193334, + "loss": 2.808, + "step": 1415 + }, + { + "epoch": 0.35, + "learning_rate": 0.00016616034490419648, + "loss": 2.7782, + "step": 1420 + }, + { + "epoch": 0.35, + "learning_rate": 0.00016577251351881532, + "loss": 2.76, + "step": 1425 + }, + { + "epoch": 0.35, + "learning_rate": 0.00016538386428980638, + "loss": 2.7957, + "step": 1430 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001649944031837418, + "loss": 2.819, + "step": 1435 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001646041361796578, + "loss": 2.7574, + "step": 1440 + }, + { + "epoch": 0.36, + "learning_rate": 0.00016421306926896266, + "loss": 2.7939, + "step": 1445 + }, + { + "epoch": 0.36, + "learning_rate": 0.00016382120845534497, + "loss": 2.7814, + "step": 1450 + }, + { + "epoch": 0.36, + "learning_rate": 0.00016342855975468135, + "loss": 2.7271, + "step": 1455 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001630351291949442, + "loss": 2.8333, + "step": 1460 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001626409228161089, + "loss": 2.8128, + "step": 1465 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001622459466700615, + "loss": 2.7752, + "step": 1470 + }, + { + "epoch": 0.36, + "learning_rate": 0.00016185020682050541, + "loss": 2.8474, + "step": 1475 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001614537093428685, + "loss": 2.8078, + "step": 1480 + }, + { + "epoch": 0.37, + "learning_rate": 0.00016105646032420982, + "loss": 2.7696, + "step": 1485 + }, + { + "epoch": 0.37, + "learning_rate": 0.00016065846586312617, + "loss": 2.8652, + "step": 1490 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001602597320696584, + "loss": 2.7888, + "step": 1495 + }, + { + "epoch": 0.37, + "learning_rate": 0.00015986026506519755, + "loss": 2.7349, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 4059, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.8026854820610048e+19, + "train_batch_size": 18, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2e45346 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7338f8c5d7e5e87e4fe2f9db5fec5f4c807ae1d04e6511b11790439862115f76 +size 5240