commit 4aca8c28126df7ef79affac13ad1c0ccb9f741db Author: ModelHub XC Date: Sat May 9 12:48:30 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Yousefbahr/Turjman-Cold-Start Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b6153b4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..bf13763 --- /dev/null +++ b/README.md @@ -0,0 +1,227 @@ +--- +base_model: +- CohereLabs/aya-expanse-8b +pipeline_tag: text-generation +tags: +- conversational +- translation +- text-generation-inference +--- + +# Model Card for Model ID + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "Yousefbahr/Turjman-Cold-Start" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) + +# Format the message with the chat template +messages = [{"role": "user", "content": "Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz"}] +input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) + +``` + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..eea053c --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Aya, a brilliant, sophisticated, multilingual AI-assistant trained to assist human users by providing thorough responses. You are able to interact and respond to questions in 23 languages and you are powered by a multilingual model built by Cohere For AI.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..3891c3b --- /dev/null +++ b/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "CohereForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 5, + "dtype": "bfloat16", + "eos_token_id": 255001, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "layer_norm_eps": 1e-05, + "logit_scale": 0.125, + "max_position_embeddings": 8192, + "model_type": "cohere", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 0, + "rope_parameters": { + "rope_theta": 10000, + "rope_type": "default" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": false, + "use_qk_norm": false, + "vocab_size": 256000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..ad24051 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 5, + "eos_token_id": [ + 255001 + ], + "pad_token_id": 0, + "transformers_version": "5.2.0" +} diff --git a/last-checkpoint/chat_template.jinja b/last-checkpoint/chat_template.jinja new file mode 100644 index 0000000..eea053c --- /dev/null +++ b/last-checkpoint/chat_template.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Aya, a brilliant, sophisticated, multilingual AI-assistant trained to assist human users by providing thorough responses. You are able to interact and respond to questions in 23 languages and you are powered by a multilingual model built by Cohere For AI.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} \ No newline at end of file diff --git a/last-checkpoint/config.json b/last-checkpoint/config.json new file mode 100644 index 0000000..3891c3b --- /dev/null +++ b/last-checkpoint/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "CohereForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 5, + "dtype": "bfloat16", + "eos_token_id": 255001, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "layer_norm_eps": 1e-05, + "logit_scale": 0.125, + "max_position_embeddings": 8192, + "model_type": "cohere", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 0, + "rope_parameters": { + "rope_theta": 10000, + "rope_type": "default" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": false, + "use_qk_norm": false, + "vocab_size": 256000 +} diff --git a/last-checkpoint/generation_config.json b/last-checkpoint/generation_config.json new file mode 100644 index 0000000..ad24051 --- /dev/null +++ b/last-checkpoint/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 5, + "eos_token_id": [ + 255001 + ], + "pad_token_id": 0, + "transformers_version": "5.2.0" +} diff --git a/last-checkpoint/model.safetensors b/last-checkpoint/model.safetensors new file mode 100644 index 0000000..6bd3f61 --- /dev/null +++ b/last-checkpoint/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d874b039ecc399abb9bd66ffe499e41dc0d648bc19f5df44b3f5872071a74c +size 16056096184 diff --git a/last-checkpoint/tokenizer.json b/last-checkpoint/tokenizer.json new file mode 100644 index 0000000..4bf8c81 --- /dev/null +++ b/last-checkpoint/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72cb57ce120d4797ec0a544cee3792f228c9dca4cb1863e11cb341f0ec3e576 +size 20124086 diff --git a/last-checkpoint/tokenizer_config.json b/last-checkpoint/tokenizer_config.json new file mode 100644 index 0000000..8a7b27e --- /dev/null +++ b/last-checkpoint/tokenizer_config.json @@ -0,0 +1,22 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "<|END_OF_TURN_TOKEN|>", + "errors": "replace", + "is_local": true, + "legacy": true, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sep_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "CohereTokenizer", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/last-checkpoint/trainer_state.json b/last-checkpoint/trainer_state.json new file mode 100644 index 0000000..4112cde --- /dev/null +++ b/last-checkpoint/trainer_state.json @@ -0,0 +1,209 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6724949562878278, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.026899798251513115, + "grad_norm": 0.9497337273894788, + "learning_rate": 9.879032258064517e-06, + "loss": 0.21184968948364258, + "step": 10 + }, + { + "epoch": 0.05379959650302623, + "grad_norm": 0.954080750401948, + "learning_rate": 9.744623655913979e-06, + "loss": 0.21339879035949708, + "step": 20 + }, + { + "epoch": 0.08069939475453934, + "grad_norm": 0.9097806480768484, + "learning_rate": 9.610215053763442e-06, + "loss": 0.2068164110183716, + "step": 30 + }, + { + "epoch": 0.10759919300605246, + "grad_norm": 1.0018783014353256, + "learning_rate": 9.475806451612905e-06, + "loss": 0.2059403896331787, + "step": 40 + }, + { + "epoch": 0.13449899125756556, + "grad_norm": 0.9372500482771303, + "learning_rate": 9.341397849462367e-06, + "loss": 0.19799630641937255, + "step": 50 + }, + { + "epoch": 0.16139878950907868, + "grad_norm": 0.9509184109653567, + "learning_rate": 9.206989247311828e-06, + "loss": 0.2008678436279297, + "step": 60 + }, + { + "epoch": 0.1882985877605918, + "grad_norm": 1.1134282940647546, + "learning_rate": 9.072580645161291e-06, + "loss": 0.20529050827026368, + "step": 70 + }, + { + "epoch": 0.21519838601210492, + "grad_norm": 0.9233328708641031, + "learning_rate": 8.938172043010753e-06, + "loss": 0.2078157901763916, + "step": 80 + }, + { + "epoch": 0.242098184263618, + "grad_norm": 1.0158369506432157, + "learning_rate": 8.803763440860216e-06, + "loss": 0.2069852113723755, + "step": 90 + }, + { + "epoch": 0.26899798251513113, + "grad_norm": 0.9962467911152623, + "learning_rate": 8.669354838709677e-06, + "loss": 0.19123213291168212, + "step": 100 + }, + { + "epoch": 0.29589778076664425, + "grad_norm": 0.9050182554717234, + "learning_rate": 8.53494623655914e-06, + "loss": 0.1998592734336853, + "step": 110 + }, + { + "epoch": 0.32279757901815737, + "grad_norm": 1.16989042706525, + "learning_rate": 8.400537634408604e-06, + "loss": 0.19017333984375, + "step": 120 + }, + { + "epoch": 0.3496973772696705, + "grad_norm": 0.9202065176879053, + "learning_rate": 8.266129032258065e-06, + "loss": 0.1971895694732666, + "step": 130 + }, + { + "epoch": 0.3765971755211836, + "grad_norm": 0.9882788749577818, + "learning_rate": 8.131720430107529e-06, + "loss": 0.20348339080810546, + "step": 140 + }, + { + "epoch": 0.4034969737726967, + "grad_norm": 0.9666836025801647, + "learning_rate": 7.99731182795699e-06, + "loss": 0.18486206531524657, + "step": 150 + }, + { + "epoch": 0.43039677202420984, + "grad_norm": 0.8578034645388888, + "learning_rate": 7.862903225806451e-06, + "loss": 0.19098201990127564, + "step": 160 + }, + { + "epoch": 0.45729657027572296, + "grad_norm": 0.9758615729928365, + "learning_rate": 7.728494623655915e-06, + "loss": 0.18409807682037355, + "step": 170 + }, + { + "epoch": 0.484196368527236, + "grad_norm": 0.9564910398607444, + "learning_rate": 7.594086021505377e-06, + "loss": 0.20637857913970947, + "step": 180 + }, + { + "epoch": 0.5110961667787491, + "grad_norm": 1.09919047792078, + "learning_rate": 7.459677419354839e-06, + "loss": 0.1931600570678711, + "step": 190 + }, + { + "epoch": 0.5379959650302623, + "grad_norm": 1.0745145882317149, + "learning_rate": 7.325268817204302e-06, + "loss": 0.19824893474578859, + "step": 200 + }, + { + "epoch": 0.5648957632817754, + "grad_norm": 0.9633608114145525, + "learning_rate": 7.190860215053764e-06, + "loss": 0.2040123701095581, + "step": 210 + }, + { + "epoch": 0.5917955615332885, + "grad_norm": 1.0540947069171986, + "learning_rate": 7.056451612903227e-06, + "loss": 0.200044584274292, + "step": 220 + }, + { + "epoch": 0.6186953597848016, + "grad_norm": 0.9784264211901752, + "learning_rate": 6.9220430107526895e-06, + "loss": 0.2000995397567749, + "step": 230 + }, + { + "epoch": 0.6455951580363147, + "grad_norm": 1.1286177734944867, + "learning_rate": 6.787634408602151e-06, + "loss": 0.1975030779838562, + "step": 240 + }, + { + "epoch": 0.6724949562878278, + "grad_norm": 1.0140389237975715, + "learning_rate": 6.653225806451613e-06, + "loss": 0.1995392322540283, + "step": 250 + } + ], + "logging_steps": 10, + "max_steps": 744, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2257087168512.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/last-checkpoint/training_args.bin b/last-checkpoint/training_args.bin new file mode 100644 index 0000000..cb91bcd --- /dev/null +++ b/last-checkpoint/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f60116ab3a4196b53fe018135cc3143df6799b220c9919983aefb8dfae4e1a +size 6904 diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..6bd3f61 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d874b039ecc399abb9bd66ffe499e41dc0d648bc19f5df44b3f5872071a74c +size 16056096184 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..4bf8c81 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72cb57ce120d4797ec0a544cee3792f228c9dca4cb1863e11cb341f0ec3e576 +size 20124086 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8a7b27e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,22 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "<|END_OF_TURN_TOKEN|>", + "errors": "replace", + "is_local": true, + "legacy": true, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sep_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "CohereTokenizer", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..126a2a7 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,100 @@ +{"current_steps": 10, "total_steps": 744, "loss": 0.8225662231445312, "lr": 9.879032258064517e-06, "epoch": 0.026899798251513115, "percentage": 1.34, "elapsed_time": "0:00:52", "remaining_time": "1:03:40"} +{"current_steps": 20, "total_steps": 744, "loss": 0.5425550460815429, "lr": 9.744623655913979e-06, "epoch": 0.05379959650302623, "percentage": 2.69, "elapsed_time": "0:01:43", "remaining_time": "1:02:28"} +{"current_steps": 30, "total_steps": 744, "loss": 0.49512438774108886, "lr": 9.610215053763442e-06, "epoch": 0.08069939475453934, "percentage": 4.03, "elapsed_time": "0:02:34", "remaining_time": "1:01:11"} +{"current_steps": 40, "total_steps": 744, "loss": 0.46259517669677735, "lr": 9.475806451612905e-06, "epoch": 0.10759919300605246, "percentage": 5.38, "elapsed_time": "0:03:24", "remaining_time": "1:00:06"} +{"current_steps": 50, "total_steps": 744, "loss": 0.4650130271911621, "lr": 9.341397849462367e-06, "epoch": 0.13449899125756556, "percentage": 6.72, "elapsed_time": "0:04:14", "remaining_time": "0:58:57"} +{"current_steps": 60, "total_steps": 744, "loss": 0.4469588756561279, "lr": 9.206989247311828e-06, "epoch": 0.16139878950907868, "percentage": 8.06, "elapsed_time": "0:05:04", "remaining_time": "0:57:52"} +{"current_steps": 70, "total_steps": 744, "loss": 0.4584482192993164, "lr": 9.072580645161291e-06, "epoch": 0.1882985877605918, "percentage": 9.41, "elapsed_time": "0:05:54", "remaining_time": "0:56:53"} +{"current_steps": 80, "total_steps": 744, "loss": 0.4443532943725586, "lr": 8.938172043010753e-06, "epoch": 0.21519838601210492, "percentage": 10.75, "elapsed_time": "0:06:45", "remaining_time": "0:56:06"} +{"current_steps": 90, "total_steps": 744, "loss": 0.43974981307983396, "lr": 8.803763440860216e-06, "epoch": 0.242098184263618, "percentage": 12.1, "elapsed_time": "0:07:35", "remaining_time": "0:55:08"} +{"current_steps": 100, "total_steps": 744, "loss": 0.4207331657409668, "lr": 8.669354838709677e-06, "epoch": 0.26899798251513113, "percentage": 13.44, "elapsed_time": "0:08:24", "remaining_time": "0:54:11"} +{"current_steps": 110, "total_steps": 744, "loss": 0.4176794052124023, "lr": 8.53494623655914e-06, "epoch": 0.29589778076664425, "percentage": 14.78, "elapsed_time": "0:09:15", "remaining_time": "0:53:19"} +{"current_steps": 120, "total_steps": 744, "loss": 0.4135931968688965, "lr": 8.400537634408604e-06, "epoch": 0.32279757901815737, "percentage": 16.13, "elapsed_time": "0:10:04", "remaining_time": "0:52:25"} +{"current_steps": 130, "total_steps": 744, "loss": 0.40773611068725585, "lr": 8.266129032258065e-06, "epoch": 0.3496973772696705, "percentage": 17.47, "elapsed_time": "0:10:55", "remaining_time": "0:51:37"} +{"current_steps": 140, "total_steps": 744, "loss": 0.40926170349121094, "lr": 8.131720430107529e-06, "epoch": 0.3765971755211836, "percentage": 18.82, "elapsed_time": "0:11:46", "remaining_time": "0:50:48"} +{"current_steps": 150, "total_steps": 744, "loss": 0.3898899078369141, "lr": 7.99731182795699e-06, "epoch": 0.4034969737726967, "percentage": 20.16, "elapsed_time": "0:12:37", "remaining_time": "0:49:58"} +{"current_steps": 160, "total_steps": 744, "loss": 0.4007321834564209, "lr": 7.862903225806451e-06, "epoch": 0.43039677202420984, "percentage": 21.51, "elapsed_time": "0:13:27", "remaining_time": "0:49:06"} +{"current_steps": 170, "total_steps": 744, "loss": 0.38925461769104003, "lr": 7.728494623655915e-06, "epoch": 0.45729657027572296, "percentage": 22.85, "elapsed_time": "0:14:16", "remaining_time": "0:48:13"} +{"current_steps": 180, "total_steps": 744, "loss": 0.40600900650024413, "lr": 7.594086021505377e-06, "epoch": 0.484196368527236, "percentage": 24.19, "elapsed_time": "0:15:08", "remaining_time": "0:47:26"} +{"current_steps": 190, "total_steps": 744, "loss": 0.39852426052093504, "lr": 7.459677419354839e-06, "epoch": 0.5110961667787491, "percentage": 25.54, "elapsed_time": "0:15:58", "remaining_time": "0:46:35"} +{"current_steps": 200, "total_steps": 744, "loss": 0.3985164642333984, "lr": 7.325268817204302e-06, "epoch": 0.5379959650302623, "percentage": 26.88, "elapsed_time": "0:16:48", "remaining_time": "0:45:43"} +{"current_steps": 210, "total_steps": 744, "loss": 0.4025214195251465, "lr": 7.190860215053764e-06, "epoch": 0.5648957632817754, "percentage": 28.23, "elapsed_time": "0:17:38", "remaining_time": "0:44:52"} +{"current_steps": 220, "total_steps": 744, "loss": 0.39273393154144287, "lr": 7.056451612903227e-06, "epoch": 0.5917955615332885, "percentage": 29.57, "elapsed_time": "0:18:29", "remaining_time": "0:44:03"} +{"current_steps": 230, "total_steps": 744, "loss": 0.3891463279724121, "lr": 6.9220430107526895e-06, "epoch": 0.6186953597848016, "percentage": 30.91, "elapsed_time": "0:19:20", "remaining_time": "0:43:12"} +{"current_steps": 240, "total_steps": 744, "loss": 0.386672830581665, "lr": 6.787634408602151e-06, "epoch": 0.6455951580363147, "percentage": 32.26, "elapsed_time": "0:20:09", "remaining_time": "0:42:20"} +{"current_steps": 250, "total_steps": 744, "loss": 0.3941431760787964, "lr": 6.653225806451613e-06, "epoch": 0.6724949562878278, "percentage": 33.6, "elapsed_time": "0:20:59", "remaining_time": "0:41:29"} +{"current_steps": 260, "total_steps": 744, "loss": 0.375079083442688, "lr": 6.518817204301076e-06, "epoch": 0.699394754539341, "percentage": 34.95, "elapsed_time": "0:22:53", "remaining_time": "0:42:36"} +{"current_steps": 270, "total_steps": 744, "loss": 0.3753389358520508, "lr": 6.384408602150538e-06, "epoch": 0.7262945527908541, "percentage": 36.29, "elapsed_time": "0:23:44", "remaining_time": "0:41:41"} +{"current_steps": 280, "total_steps": 744, "loss": 0.3938201665878296, "lr": 6.25e-06, "epoch": 0.7531943510423672, "percentage": 37.63, "elapsed_time": "0:24:35", "remaining_time": "0:40:45"} +{"current_steps": 290, "total_steps": 744, "loss": 0.3773271799087524, "lr": 6.115591397849463e-06, "epoch": 0.7800941492938803, "percentage": 38.98, "elapsed_time": "0:25:26", "remaining_time": "0:39:49"} +{"current_steps": 300, "total_steps": 744, "loss": 0.36693835258483887, "lr": 5.981182795698926e-06, "epoch": 0.8069939475453934, "percentage": 40.32, "elapsed_time": "0:26:17", "remaining_time": "0:38:54"} +{"current_steps": 310, "total_steps": 744, "loss": 0.3684189796447754, "lr": 5.846774193548388e-06, "epoch": 0.8338937457969066, "percentage": 41.67, "elapsed_time": "0:27:08", "remaining_time": "0:38:00"} +{"current_steps": 320, "total_steps": 744, "loss": 0.38952927589416503, "lr": 5.7123655913978505e-06, "epoch": 0.8607935440484197, "percentage": 43.01, "elapsed_time": "0:27:57", "remaining_time": "0:37:03"} +{"current_steps": 330, "total_steps": 744, "loss": 0.3876913547515869, "lr": 5.577956989247312e-06, "epoch": 0.8876933422999328, "percentage": 44.35, "elapsed_time": "0:28:47", "remaining_time": "0:36:07"} +{"current_steps": 340, "total_steps": 744, "loss": 0.3687325954437256, "lr": 5.443548387096774e-06, "epoch": 0.9145931405514459, "percentage": 45.7, "elapsed_time": "0:29:38", "remaining_time": "0:35:12"} +{"current_steps": 350, "total_steps": 744, "loss": 0.3701371192932129, "lr": 5.309139784946237e-06, "epoch": 0.9414929388029589, "percentage": 47.04, "elapsed_time": "0:30:30", "remaining_time": "0:34:20"} +{"current_steps": 360, "total_steps": 744, "loss": 0.37389140129089354, "lr": 5.174731182795699e-06, "epoch": 0.968392737054472, "percentage": 48.39, "elapsed_time": "0:31:21", "remaining_time": "0:33:26"} +{"current_steps": 370, "total_steps": 744, "loss": 0.37515921592712403, "lr": 5.040322580645161e-06, "epoch": 0.9952925353059852, "percentage": 49.73, "elapsed_time": "0:32:12", "remaining_time": "0:32:33"} +{"current_steps": 380, "total_steps": 744, "loss": 0.2644005298614502, "lr": 4.9059139784946245e-06, "epoch": 1.0215198386012105, "percentage": 51.08, "elapsed_time": "0:33:02", "remaining_time": "0:31:38"} +{"current_steps": 390, "total_steps": 744, "loss": 0.22325487136840821, "lr": 4.771505376344087e-06, "epoch": 1.0484196368527237, "percentage": 52.42, "elapsed_time": "0:33:51", "remaining_time": "0:30:44"} +{"current_steps": 400, "total_steps": 744, "loss": 0.20546340942382812, "lr": 4.637096774193548e-06, "epoch": 1.0753194351042368, "percentage": 53.76, "elapsed_time": "0:34:43", "remaining_time": "0:29:51"} +{"current_steps": 410, "total_steps": 744, "loss": 0.22176458835601806, "lr": 4.502688172043011e-06, "epoch": 1.10221923335575, "percentage": 55.11, "elapsed_time": "0:35:33", "remaining_time": "0:28:57"} +{"current_steps": 420, "total_steps": 744, "loss": 0.21236634254455566, "lr": 4.368279569892474e-06, "epoch": 1.129119031607263, "percentage": 56.45, "elapsed_time": "0:36:24", "remaining_time": "0:28:05"} +{"current_steps": 430, "total_steps": 744, "loss": 0.20211517810821533, "lr": 4.233870967741936e-06, "epoch": 1.1560188298587761, "percentage": 57.8, "elapsed_time": "0:37:14", "remaining_time": "0:27:11"} +{"current_steps": 440, "total_steps": 744, "loss": 0.2012125015258789, "lr": 4.0994623655913985e-06, "epoch": 1.1829186281102892, "percentage": 59.14, "elapsed_time": "0:38:03", "remaining_time": "0:26:18"} +{"current_steps": 450, "total_steps": 744, "loss": 0.21971840858459474, "lr": 3.96505376344086e-06, "epoch": 1.2098184263618024, "percentage": 60.48, "elapsed_time": "0:38:55", "remaining_time": "0:25:25"} +{"current_steps": 460, "total_steps": 744, "loss": 0.21090972423553467, "lr": 3.830645161290323e-06, "epoch": 1.2367182246133155, "percentage": 61.83, "elapsed_time": "0:39:46", "remaining_time": "0:24:33"} +{"current_steps": 470, "total_steps": 744, "loss": 0.21233777999877929, "lr": 3.6962365591397855e-06, "epoch": 1.2636180228648284, "percentage": 63.17, "elapsed_time": "0:40:36", "remaining_time": "0:23:40"} +{"current_steps": 480, "total_steps": 744, "loss": 0.21166794300079345, "lr": 3.5618279569892478e-06, "epoch": 1.2905178211163415, "percentage": 64.52, "elapsed_time": "0:41:27", "remaining_time": "0:22:48"} +{"current_steps": 490, "total_steps": 744, "loss": 0.20196475982666015, "lr": 3.4274193548387097e-06, "epoch": 1.3174176193678546, "percentage": 65.86, "elapsed_time": "0:42:18", "remaining_time": "0:21:56"} +{"current_steps": 500, "total_steps": 744, "loss": 0.20718650817871093, "lr": 3.293010752688172e-06, "epoch": 1.3443174176193677, "percentage": 67.2, "elapsed_time": "0:43:08", "remaining_time": "0:21:03"} +{"current_steps": 510, "total_steps": 744, "loss": 0.19807689189910888, "lr": 3.1586021505376348e-06, "epoch": 1.3712172158708809, "percentage": 68.55, "elapsed_time": "0:45:04", "remaining_time": "0:20:40"} +{"current_steps": 520, "total_steps": 744, "loss": 0.20845794677734375, "lr": 3.024193548387097e-06, "epoch": 1.398117014122394, "percentage": 69.89, "elapsed_time": "0:45:55", "remaining_time": "0:19:46"} +{"current_steps": 530, "total_steps": 744, "loss": 0.2050685167312622, "lr": 2.8897849462365594e-06, "epoch": 1.425016812373907, "percentage": 71.24, "elapsed_time": "0:46:45", "remaining_time": "0:18:52"} +{"current_steps": 540, "total_steps": 744, "loss": 0.1973992943763733, "lr": 2.7553763440860214e-06, "epoch": 1.4519166106254202, "percentage": 72.58, "elapsed_time": "0:47:35", "remaining_time": "0:17:58"} +{"current_steps": 550, "total_steps": 744, "loss": 0.20508060455322266, "lr": 2.620967741935484e-06, "epoch": 1.4788164088769333, "percentage": 73.92, "elapsed_time": "0:48:25", "remaining_time": "0:17:04"} +{"current_steps": 560, "total_steps": 744, "loss": 0.1983818531036377, "lr": 2.4865591397849464e-06, "epoch": 1.5057162071284464, "percentage": 75.27, "elapsed_time": "0:49:16", "remaining_time": "0:16:11"} +{"current_steps": 570, "total_steps": 744, "loss": 0.20342755317687988, "lr": 2.3521505376344088e-06, "epoch": 1.5326160053799596, "percentage": 76.61, "elapsed_time": "0:50:06", "remaining_time": "0:15:17"} +{"current_steps": 580, "total_steps": 744, "loss": 0.20011866092681885, "lr": 2.217741935483871e-06, "epoch": 1.5595158036314727, "percentage": 77.96, "elapsed_time": "0:50:56", "remaining_time": "0:14:24"} +{"current_steps": 590, "total_steps": 744, "loss": 0.19641097784042358, "lr": 2.0833333333333334e-06, "epoch": 1.5864156018829858, "percentage": 79.3, "elapsed_time": "0:51:46", "remaining_time": "0:13:30"} +{"current_steps": 600, "total_steps": 744, "loss": 0.19918395280838014, "lr": 1.9489247311827958e-06, "epoch": 1.613315400134499, "percentage": 80.65, "elapsed_time": "0:52:38", "remaining_time": "0:12:37"} +{"current_steps": 610, "total_steps": 744, "loss": 0.200819993019104, "lr": 1.8145161290322583e-06, "epoch": 1.640215198386012, "percentage": 81.99, "elapsed_time": "0:53:29", "remaining_time": "0:11:45"} +{"current_steps": 620, "total_steps": 744, "loss": 0.20298488140106202, "lr": 1.6801075268817204e-06, "epoch": 1.6671149966375252, "percentage": 83.33, "elapsed_time": "0:54:19", "remaining_time": "0:10:51"} +{"current_steps": 630, "total_steps": 744, "loss": 0.2038034677505493, "lr": 1.545698924731183e-06, "epoch": 1.6940147948890383, "percentage": 84.68, "elapsed_time": "0:55:11", "remaining_time": "0:09:59"} +{"current_steps": 640, "total_steps": 744, "loss": 0.19510786533355712, "lr": 1.4112903225806455e-06, "epoch": 1.7209145931405514, "percentage": 86.02, "elapsed_time": "0:56:00", "remaining_time": "0:09:06"} +{"current_steps": 650, "total_steps": 744, "loss": 0.193765389919281, "lr": 1.2768817204301076e-06, "epoch": 1.7478143913920645, "percentage": 87.37, "elapsed_time": "0:56:52", "remaining_time": "0:08:13"} +{"current_steps": 660, "total_steps": 744, "loss": 0.2017449378967285, "lr": 1.14247311827957e-06, "epoch": 1.7747141896435776, "percentage": 88.71, "elapsed_time": "0:57:42", "remaining_time": "0:07:20"} +{"current_steps": 670, "total_steps": 744, "loss": 0.20550622940063476, "lr": 1.0080645161290323e-06, "epoch": 1.8016139878950908, "percentage": 90.05, "elapsed_time": "0:58:32", "remaining_time": "0:06:27"} +{"current_steps": 680, "total_steps": 744, "loss": 0.20840318202972413, "lr": 8.736559139784947e-07, "epoch": 1.8285137861466039, "percentage": 91.4, "elapsed_time": "0:59:22", "remaining_time": "0:05:35"} +{"current_steps": 690, "total_steps": 744, "loss": 0.2029731512069702, "lr": 7.392473118279571e-07, "epoch": 1.855413584398117, "percentage": 92.74, "elapsed_time": "1:00:12", "remaining_time": "0:04:42"} +{"current_steps": 700, "total_steps": 744, "loss": 0.19700987339019777, "lr": 6.048387096774194e-07, "epoch": 1.88231338264963, "percentage": 94.09, "elapsed_time": "1:01:02", "remaining_time": "0:03:50"} +{"current_steps": 710, "total_steps": 744, "loss": 0.2045133590698242, "lr": 4.7043010752688173e-07, "epoch": 1.9092131809011432, "percentage": 95.43, "elapsed_time": "1:01:53", "remaining_time": "0:02:57"} +{"current_steps": 720, "total_steps": 744, "loss": 0.21047801971435548, "lr": 3.360215053763441e-07, "epoch": 1.9361129791526563, "percentage": 96.77, "elapsed_time": "1:02:45", "remaining_time": "0:02:05"} +{"current_steps": 730, "total_steps": 744, "loss": 0.18875229358673096, "lr": 2.0161290322580645e-07, "epoch": 1.9630127774041695, "percentage": 98.12, "elapsed_time": "1:03:36", "remaining_time": "0:01:13"} +{"current_steps": 740, "total_steps": 744, "loss": 0.19709365367889403, "lr": 6.720430107526882e-08, "epoch": 1.9899125756556826, "percentage": 99.46, "elapsed_time": "1:04:26", "remaining_time": "0:00:20"} +{"current_steps": 10, "total_steps": 744, "loss": 0.21184968948364258, "lr": 9.879032258064517e-06, "epoch": 0.026899798251513115, "percentage": 1.34, "elapsed_time": "0:00:52", "remaining_time": "1:03:45"} +{"current_steps": 20, "total_steps": 744, "loss": 0.21339879035949708, "lr": 9.744623655913979e-06, "epoch": 0.05379959650302623, "percentage": 2.69, "elapsed_time": "0:01:43", "remaining_time": "1:02:35"} +{"current_steps": 30, "total_steps": 744, "loss": 0.2068164110183716, "lr": 9.610215053763442e-06, "epoch": 0.08069939475453934, "percentage": 4.03, "elapsed_time": "0:02:34", "remaining_time": "1:01:20"} +{"current_steps": 40, "total_steps": 744, "loss": 0.2059403896331787, "lr": 9.475806451612905e-06, "epoch": 0.10759919300605246, "percentage": 5.38, "elapsed_time": "0:03:25", "remaining_time": "1:00:16"} +{"current_steps": 50, "total_steps": 744, "loss": 0.19799630641937255, "lr": 9.341397849462367e-06, "epoch": 0.13449899125756556, "percentage": 6.72, "elapsed_time": "0:04:15", "remaining_time": "0:59:05"} +{"current_steps": 60, "total_steps": 744, "loss": 0.2008678436279297, "lr": 9.206989247311828e-06, "epoch": 0.16139878950907868, "percentage": 8.06, "elapsed_time": "0:05:05", "remaining_time": "0:57:57"} +{"current_steps": 70, "total_steps": 744, "loss": 0.20529050827026368, "lr": 9.072580645161291e-06, "epoch": 0.1882985877605918, "percentage": 9.41, "elapsed_time": "0:05:55", "remaining_time": "0:56:59"} +{"current_steps": 80, "total_steps": 744, "loss": 0.2078157901763916, "lr": 8.938172043010753e-06, "epoch": 0.21519838601210492, "percentage": 10.75, "elapsed_time": "0:06:46", "remaining_time": "0:56:11"} +{"current_steps": 90, "total_steps": 744, "loss": 0.2069852113723755, "lr": 8.803763440860216e-06, "epoch": 0.242098184263618, "percentage": 12.1, "elapsed_time": "0:07:35", "remaining_time": "0:55:11"} +{"current_steps": 100, "total_steps": 744, "loss": 0.19123213291168212, "lr": 8.669354838709677e-06, "epoch": 0.26899798251513113, "percentage": 13.44, "elapsed_time": "0:08:25", "remaining_time": "0:54:16"} +{"current_steps": 110, "total_steps": 744, "loss": 0.1998592734336853, "lr": 8.53494623655914e-06, "epoch": 0.29589778076664425, "percentage": 14.78, "elapsed_time": "0:09:16", "remaining_time": "0:53:24"} +{"current_steps": 120, "total_steps": 744, "loss": 0.19017333984375, "lr": 8.400537634408604e-06, "epoch": 0.32279757901815737, "percentage": 16.13, "elapsed_time": "0:10:05", "remaining_time": "0:52:29"} +{"current_steps": 130, "total_steps": 744, "loss": 0.1971895694732666, "lr": 8.266129032258065e-06, "epoch": 0.3496973772696705, "percentage": 17.47, "elapsed_time": "0:10:56", "remaining_time": "0:51:41"} +{"current_steps": 140, "total_steps": 744, "loss": 0.20348339080810546, "lr": 8.131720430107529e-06, "epoch": 0.3765971755211836, "percentage": 18.82, "elapsed_time": "0:11:47", "remaining_time": "0:50:52"} +{"current_steps": 150, "total_steps": 744, "loss": 0.18486206531524657, "lr": 7.99731182795699e-06, "epoch": 0.4034969737726967, "percentage": 20.16, "elapsed_time": "0:12:38", "remaining_time": "0:50:03"} +{"current_steps": 160, "total_steps": 744, "loss": 0.19098201990127564, "lr": 7.862903225806451e-06, "epoch": 0.43039677202420984, "percentage": 21.51, "elapsed_time": "0:13:28", "remaining_time": "0:49:10"} +{"current_steps": 170, "total_steps": 744, "loss": 0.18409807682037355, "lr": 7.728494623655915e-06, "epoch": 0.45729657027572296, "percentage": 22.85, "elapsed_time": "0:14:18", "remaining_time": "0:48:17"} +{"current_steps": 180, "total_steps": 744, "loss": 0.20637857913970947, "lr": 7.594086021505377e-06, "epoch": 0.484196368527236, "percentage": 24.19, "elapsed_time": "0:15:09", "remaining_time": "0:47:30"} +{"current_steps": 190, "total_steps": 744, "loss": 0.1931600570678711, "lr": 7.459677419354839e-06, "epoch": 0.5110961667787491, "percentage": 25.54, "elapsed_time": "0:16:00", "remaining_time": "0:46:39"} +{"current_steps": 200, "total_steps": 744, "loss": 0.19824893474578859, "lr": 7.325268817204302e-06, "epoch": 0.5379959650302623, "percentage": 26.88, "elapsed_time": "0:16:50", "remaining_time": "0:45:48"} +{"current_steps": 210, "total_steps": 744, "loss": 0.2040123701095581, "lr": 7.190860215053764e-06, "epoch": 0.5648957632817754, "percentage": 28.23, "elapsed_time": "0:17:40", "remaining_time": "0:44:57"} +{"current_steps": 220, "total_steps": 744, "loss": 0.200044584274292, "lr": 7.056451612903227e-06, "epoch": 0.5917955615332885, "percentage": 29.57, "elapsed_time": "0:18:31", "remaining_time": "0:44:07"} +{"current_steps": 230, "total_steps": 744, "loss": 0.2000995397567749, "lr": 6.9220430107526895e-06, "epoch": 0.6186953597848016, "percentage": 30.91, "elapsed_time": "0:19:21", "remaining_time": "0:43:16"} +{"current_steps": 240, "total_steps": 744, "loss": 0.1975030779838562, "lr": 6.787634408602151e-06, "epoch": 0.6455951580363147, "percentage": 32.26, "elapsed_time": "0:20:11", "remaining_time": "0:42:24"} +{"current_steps": 250, "total_steps": 744, "loss": 0.1995392322540283, "lr": 6.653225806451613e-06, "epoch": 0.6724949562878278, "percentage": 33.6, "elapsed_time": "0:21:01", "remaining_time": "0:41:32"} +{"current_steps": 260, "total_steps": 744, "loss": 0.1932253360748291, "lr": 6.518817204301076e-06, "epoch": 0.699394754539341, "percentage": 34.95, "elapsed_time": "0:22:56", "remaining_time": "0:42:42"} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..cb91bcd --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f60116ab3a4196b53fe018135cc3143df6799b220c9919983aefb8dfae4e1a +size 6904