初始化项目,由ModelHub XC社区提供模型
Model: varshak1/openrubric-judgment-sft Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
60
README.md
Normal file
60
README.md
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
license: other
|
||||||
|
base_model: Qwen/Qwen3-8B
|
||||||
|
tags:
|
||||||
|
- llama-factory
|
||||||
|
- full
|
||||||
|
- generated_from_trainer
|
||||||
|
model-index:
|
||||||
|
- name: openrubric-judgment-sft
|
||||||
|
results: []
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
||||||
|
should probably proofread and complete it, then remove this comment. -->
|
||||||
|
|
||||||
|
# openrubric-judgment-sft
|
||||||
|
|
||||||
|
This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the openrubric-judgment-sft dataset.
|
||||||
|
|
||||||
|
## Model description
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Intended uses & limitations
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training and evaluation data
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training procedure
|
||||||
|
|
||||||
|
### Training hyperparameters
|
||||||
|
|
||||||
|
The following hyperparameters were used during training:
|
||||||
|
- learning_rate: 5e-06
|
||||||
|
- train_batch_size: 4
|
||||||
|
- eval_batch_size: 8
|
||||||
|
- seed: 42
|
||||||
|
- distributed_type: multi-GPU
|
||||||
|
- num_devices: 8
|
||||||
|
- gradient_accumulation_steps: 4
|
||||||
|
- total_train_batch_size: 128
|
||||||
|
- total_eval_batch_size: 64
|
||||||
|
- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
|
||||||
|
- lr_scheduler_type: linear
|
||||||
|
- num_epochs: 2.0
|
||||||
|
|
||||||
|
### Training results
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- Transformers 5.2.0
|
||||||
|
- Pytorch 2.6.0+cu124
|
||||||
|
- Datasets 4.0.0
|
||||||
|
- Tokenizers 0.22.2
|
||||||
8
all_results.json
Normal file
8
all_results.json
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"epoch": 2.0,
|
||||||
|
"total_flos": 1.4534558685629252e+19,
|
||||||
|
"train_loss": 0.27833450065266935,
|
||||||
|
"train_runtime": 6971.6374,
|
||||||
|
"train_samples_per_second": 20.435,
|
||||||
|
"train_steps_per_second": 0.16
|
||||||
|
}
|
||||||
89
chat_template.jinja
Normal file
89
chat_template.jinja
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
{%- if tools %}
|
||||||
|
{{- '<|im_start|>system\n' }}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- messages[0].content + '\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{{- "\n" }}
|
||||||
|
{{- tool | tojson }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||||
|
{%- else %}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||||
|
{%- for message in messages[::-1] %}
|
||||||
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||||
|
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
||||||
|
{%- set ns.multi_step_tool = false %}
|
||||||
|
{%- set ns.last_query_index = index %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if message.content is string %}
|
||||||
|
{%- set content = message.content %}
|
||||||
|
{%- else %}
|
||||||
|
{%- set content = '' %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{%- set reasoning_content = '' %}
|
||||||
|
{%- if message.reasoning_content is string %}
|
||||||
|
{%- set reasoning_content = message.reasoning_content %}
|
||||||
|
{%- else %}
|
||||||
|
{%- if '</think>' in content %}
|
||||||
|
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||||
|
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if loop.index0 > ns.last_query_index %}
|
||||||
|
{%- if loop.last or (not loop.last and reasoning_content) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if message.tool_calls %}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{%- if (loop.first and content) or (not loop.first) %}
|
||||||
|
{{- '\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if tool_call.function %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<tool_call>\n{"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '", "arguments": ' }}
|
||||||
|
{%- if tool_call.arguments is string %}
|
||||||
|
{{- tool_call.arguments }}
|
||||||
|
{%- else %}
|
||||||
|
{{- tool_call.arguments | tojson }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '}\n</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||||
|
{{- '<|im_start|>user' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_response>\n' }}
|
||||||
|
{{- content }}
|
||||||
|
{{- '\n</tool_response>' }}
|
||||||
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant\n' }}
|
||||||
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||||
|
{{- '<think>\n\n</think>\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
71
config.json
Normal file
71
config.json
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"Qwen3ForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": null,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"eos_token_id": 151645,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 12288,
|
||||||
|
"layer_types": [
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention"
|
||||||
|
],
|
||||||
|
"max_position_embeddings": 40960,
|
||||||
|
"max_window_layers": 36,
|
||||||
|
"model_type": "qwen3",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 36,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pad_token_id": 151643,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_parameters": {
|
||||||
|
"rope_theta": 1000000,
|
||||||
|
"rope_type": "default"
|
||||||
|
},
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"transformers_version": "5.2.0",
|
||||||
|
"use_cache": false,
|
||||||
|
"use_sliding_window": false,
|
||||||
|
"vocab_size": 151936
|
||||||
|
}
|
||||||
12
generation_config.json
Normal file
12
generation_config.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": [
|
||||||
|
151645,
|
||||||
|
151643
|
||||||
|
],
|
||||||
|
"pad_token_id": 151643,
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_k": 20,
|
||||||
|
"top_p": 0.95,
|
||||||
|
"transformers_version": "5.2.0"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:c2e7133d5ab4ee3145f94cc4abba9c8524dafa0bb4083a80a948b1da2eabfd5c
|
||||||
|
size 16381517208
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
||||||
|
size 11422650
|
||||||
15
tokenizer_config.json
Normal file
15
tokenizer_config.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"backend": "tokenizers",
|
||||||
|
"bos_token": null,
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"is_local": false,
|
||||||
|
"model_max_length": 131072,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"padding_side": "right",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "Qwen2Tokenizer",
|
||||||
|
"unk_token": null
|
||||||
|
}
|
||||||
8
train_results.json
Normal file
8
train_results.json
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"epoch": 2.0,
|
||||||
|
"total_flos": 1.4534558685629252e+19,
|
||||||
|
"train_loss": 0.27833450065266935,
|
||||||
|
"train_runtime": 6971.6374,
|
||||||
|
"train_samples_per_second": 20.435,
|
||||||
|
"train_steps_per_second": 0.16
|
||||||
|
}
|
||||||
112
trainer_log.jsonl
Normal file
112
trainer_log.jsonl
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
{"current_steps": 10, "total_steps": 1114, "loss": 0.5949527740478515, "lr": 4.959605026929982e-06, "epoch": 0.017969451931716084, "percentage": 0.9, "elapsed_time": "0:01:04", "remaining_time": "1:58:49"}
|
||||||
|
{"current_steps": 20, "total_steps": 1114, "loss": 0.41325950622558594, "lr": 4.9147217235188516e-06, "epoch": 0.03593890386343217, "percentage": 1.8, "elapsed_time": "0:02:05", "remaining_time": "1:54:29"}
|
||||||
|
{"current_steps": 30, "total_steps": 1114, "loss": 0.3709533929824829, "lr": 4.86983842010772e-06, "epoch": 0.05390835579514825, "percentage": 2.69, "elapsed_time": "0:03:06", "remaining_time": "1:52:33"}
|
||||||
|
{"current_steps": 40, "total_steps": 1114, "loss": 0.3509422540664673, "lr": 4.8249551166965895e-06, "epoch": 0.07187780772686433, "percentage": 3.59, "elapsed_time": "0:04:08", "remaining_time": "1:51:07"}
|
||||||
|
{"current_steps": 50, "total_steps": 1114, "loss": 0.3454415321350098, "lr": 4.780071813285458e-06, "epoch": 0.08984725965858041, "percentage": 4.49, "elapsed_time": "0:05:09", "remaining_time": "1:49:36"}
|
||||||
|
{"current_steps": 60, "total_steps": 1114, "loss": 0.33249969482421876, "lr": 4.7351885098743274e-06, "epoch": 0.1078167115902965, "percentage": 5.39, "elapsed_time": "0:06:10", "remaining_time": "1:48:23"}
|
||||||
|
{"current_steps": 70, "total_steps": 1114, "loss": 0.3292850971221924, "lr": 4.690305206463196e-06, "epoch": 0.12578616352201258, "percentage": 6.28, "elapsed_time": "0:07:11", "remaining_time": "1:47:20"}
|
||||||
|
{"current_steps": 80, "total_steps": 1114, "loss": 0.3244313716888428, "lr": 4.6454219030520645e-06, "epoch": 0.14375561545372867, "percentage": 7.18, "elapsed_time": "0:08:12", "remaining_time": "1:46:09"}
|
||||||
|
{"current_steps": 90, "total_steps": 1114, "loss": 0.3236015558242798, "lr": 4.600538599640934e-06, "epoch": 0.16172506738544473, "percentage": 8.08, "elapsed_time": "0:09:13", "remaining_time": "1:45:02"}
|
||||||
|
{"current_steps": 100, "total_steps": 1114, "loss": 0.3194127559661865, "lr": 4.5556552962298025e-06, "epoch": 0.17969451931716082, "percentage": 8.98, "elapsed_time": "0:10:15", "remaining_time": "1:44:01"}
|
||||||
|
{"current_steps": 110, "total_steps": 1114, "loss": 0.31786675453186036, "lr": 4.510771992818672e-06, "epoch": 0.1976639712488769, "percentage": 9.87, "elapsed_time": "0:11:16", "remaining_time": "1:42:56"}
|
||||||
|
{"current_steps": 120, "total_steps": 1114, "loss": 0.3128951072692871, "lr": 4.465888689407541e-06, "epoch": 0.215633423180593, "percentage": 10.77, "elapsed_time": "0:12:18", "remaining_time": "1:41:55"}
|
||||||
|
{"current_steps": 130, "total_steps": 1114, "loss": 0.31437077522277834, "lr": 4.42100538599641e-06, "epoch": 0.23360287511230907, "percentage": 11.67, "elapsed_time": "0:13:19", "remaining_time": "1:40:55"}
|
||||||
|
{"current_steps": 140, "total_steps": 1114, "loss": 0.30969116687774656, "lr": 4.376122082585278e-06, "epoch": 0.25157232704402516, "percentage": 12.57, "elapsed_time": "0:14:21", "remaining_time": "1:39:52"}
|
||||||
|
{"current_steps": 150, "total_steps": 1114, "loss": 0.31036303043365476, "lr": 4.331238779174148e-06, "epoch": 0.2695417789757412, "percentage": 13.46, "elapsed_time": "0:15:23", "remaining_time": "1:38:53"}
|
||||||
|
{"current_steps": 160, "total_steps": 1114, "loss": 0.30779433250427246, "lr": 4.286355475763016e-06, "epoch": 0.28751123090745734, "percentage": 14.36, "elapsed_time": "0:16:24", "remaining_time": "1:37:49"}
|
||||||
|
{"current_steps": 170, "total_steps": 1114, "loss": 0.3062736511230469, "lr": 4.241472172351886e-06, "epoch": 0.3054806828391734, "percentage": 15.26, "elapsed_time": "0:17:26", "remaining_time": "1:36:49"}
|
||||||
|
{"current_steps": 180, "total_steps": 1114, "loss": 0.30041847229003904, "lr": 4.196588868940754e-06, "epoch": 0.32345013477088946, "percentage": 16.16, "elapsed_time": "0:18:27", "remaining_time": "1:35:44"}
|
||||||
|
{"current_steps": 190, "total_steps": 1114, "loss": 0.29730544090270994, "lr": 4.151705565529624e-06, "epoch": 0.3414195867026056, "percentage": 17.06, "elapsed_time": "0:19:28", "remaining_time": "1:34:41"}
|
||||||
|
{"current_steps": 200, "total_steps": 1114, "loss": 0.30029687881469724, "lr": 4.106822262118492e-06, "epoch": 0.35938903863432164, "percentage": 17.95, "elapsed_time": "0:20:29", "remaining_time": "1:33:39"}
|
||||||
|
{"current_steps": 210, "total_steps": 1114, "loss": 0.29685449600219727, "lr": 4.061938958707361e-06, "epoch": 0.37735849056603776, "percentage": 18.85, "elapsed_time": "0:21:30", "remaining_time": "1:32:35"}
|
||||||
|
{"current_steps": 220, "total_steps": 1114, "loss": 0.2990954160690308, "lr": 4.01705565529623e-06, "epoch": 0.3953279424977538, "percentage": 19.75, "elapsed_time": "0:22:31", "remaining_time": "1:31:32"}
|
||||||
|
{"current_steps": 230, "total_steps": 1114, "loss": 0.30233011245727537, "lr": 3.9721723518850995e-06, "epoch": 0.4132973944294699, "percentage": 20.65, "elapsed_time": "0:23:33", "remaining_time": "1:30:33"}
|
||||||
|
{"current_steps": 240, "total_steps": 1114, "loss": 0.2941945314407349, "lr": 3.927289048473968e-06, "epoch": 0.431266846361186, "percentage": 21.54, "elapsed_time": "0:24:34", "remaining_time": "1:29:31"}
|
||||||
|
{"current_steps": 250, "total_steps": 1114, "loss": 0.3003401279449463, "lr": 3.882405745062837e-06, "epoch": 0.44923629829290207, "percentage": 22.44, "elapsed_time": "0:25:35", "remaining_time": "1:28:28"}
|
||||||
|
{"current_steps": 260, "total_steps": 1114, "loss": 0.2935019016265869, "lr": 3.837522441651706e-06, "epoch": 0.46720575022461813, "percentage": 23.34, "elapsed_time": "0:26:36", "remaining_time": "1:27:25"}
|
||||||
|
{"current_steps": 270, "total_steps": 1114, "loss": 0.2941242218017578, "lr": 3.792639138240575e-06, "epoch": 0.48517520215633425, "percentage": 24.24, "elapsed_time": "0:27:38", "remaining_time": "1:26:24"}
|
||||||
|
{"current_steps": 280, "total_steps": 1114, "loss": 0.2936396598815918, "lr": 3.7477558348294435e-06, "epoch": 0.5031446540880503, "percentage": 25.13, "elapsed_time": "0:28:40", "remaining_time": "1:25:23"}
|
||||||
|
{"current_steps": 290, "total_steps": 1114, "loss": 0.28728442192077636, "lr": 3.702872531418313e-06, "epoch": 0.5211141060197664, "percentage": 26.03, "elapsed_time": "0:29:41", "remaining_time": "1:24:22"}
|
||||||
|
{"current_steps": 300, "total_steps": 1114, "loss": 0.29381372928619387, "lr": 3.6579892280071814e-06, "epoch": 0.5390835579514824, "percentage": 26.93, "elapsed_time": "0:30:43", "remaining_time": "1:23:21"}
|
||||||
|
{"current_steps": 310, "total_steps": 1114, "loss": 0.28871979713439944, "lr": 3.6131059245960504e-06, "epoch": 0.5570530098831986, "percentage": 27.83, "elapsed_time": "0:31:44", "remaining_time": "1:22:19"}
|
||||||
|
{"current_steps": 320, "total_steps": 1114, "loss": 0.2919660806655884, "lr": 3.5682226211849198e-06, "epoch": 0.5750224618149147, "percentage": 28.73, "elapsed_time": "0:32:45", "remaining_time": "1:21:17"}
|
||||||
|
{"current_steps": 330, "total_steps": 1114, "loss": 0.2949108600616455, "lr": 3.5233393177737883e-06, "epoch": 0.5929919137466307, "percentage": 29.62, "elapsed_time": "0:33:47", "remaining_time": "1:20:16"}
|
||||||
|
{"current_steps": 340, "total_steps": 1114, "loss": 0.28940815925598146, "lr": 3.4784560143626573e-06, "epoch": 0.6109613656783468, "percentage": 30.52, "elapsed_time": "0:34:48", "remaining_time": "1:19:14"}
|
||||||
|
{"current_steps": 350, "total_steps": 1114, "loss": 0.2877013683319092, "lr": 3.4335727109515267e-06, "epoch": 0.6289308176100629, "percentage": 31.42, "elapsed_time": "0:35:50", "remaining_time": "1:18:13"}
|
||||||
|
{"current_steps": 360, "total_steps": 1114, "loss": 0.28816981315612794, "lr": 3.3886894075403952e-06, "epoch": 0.6469002695417789, "percentage": 32.32, "elapsed_time": "0:36:51", "remaining_time": "1:17:10"}
|
||||||
|
{"current_steps": 370, "total_steps": 1114, "loss": 0.28344998359680174, "lr": 3.343806104129264e-06, "epoch": 0.6648697214734951, "percentage": 33.21, "elapsed_time": "0:37:52", "remaining_time": "1:16:09"}
|
||||||
|
{"current_steps": 380, "total_steps": 1114, "loss": 0.28453927040100097, "lr": 3.2989228007181327e-06, "epoch": 0.6828391734052112, "percentage": 34.11, "elapsed_time": "0:38:53", "remaining_time": "1:15:07"}
|
||||||
|
{"current_steps": 390, "total_steps": 1114, "loss": 0.27885701656341555, "lr": 3.254039497307002e-06, "epoch": 0.7008086253369272, "percentage": 35.01, "elapsed_time": "0:39:55", "remaining_time": "1:14:07"}
|
||||||
|
{"current_steps": 400, "total_steps": 1114, "loss": 0.2846828937530518, "lr": 3.209156193895871e-06, "epoch": 0.7187780772686433, "percentage": 35.91, "elapsed_time": "0:40:56", "remaining_time": "1:13:04"}
|
||||||
|
{"current_steps": 410, "total_steps": 1114, "loss": 0.2884047269821167, "lr": 3.1642728904847396e-06, "epoch": 0.7367475292003594, "percentage": 36.8, "elapsed_time": "0:41:57", "remaining_time": "1:12:02"}
|
||||||
|
{"current_steps": 420, "total_steps": 1114, "loss": 0.28609886169433596, "lr": 3.119389587073609e-06, "epoch": 0.7547169811320755, "percentage": 37.7, "elapsed_time": "0:42:59", "remaining_time": "1:11:01"}
|
||||||
|
{"current_steps": 430, "total_steps": 1114, "loss": 0.2808084487915039, "lr": 3.074506283662478e-06, "epoch": 0.7726864330637916, "percentage": 38.6, "elapsed_time": "0:44:00", "remaining_time": "1:09:59"}
|
||||||
|
{"current_steps": 440, "total_steps": 1114, "loss": 0.28656601905822754, "lr": 3.0296229802513465e-06, "epoch": 0.7906558849955077, "percentage": 39.5, "elapsed_time": "0:45:02", "remaining_time": "1:08:59"}
|
||||||
|
{"current_steps": 450, "total_steps": 1114, "loss": 0.2885767936706543, "lr": 2.984739676840216e-06, "epoch": 0.8086253369272237, "percentage": 40.39, "elapsed_time": "0:46:04", "remaining_time": "1:07:58"}
|
||||||
|
{"current_steps": 460, "total_steps": 1114, "loss": 0.28384861946105955, "lr": 2.939856373429085e-06, "epoch": 0.8265947888589398, "percentage": 41.29, "elapsed_time": "0:47:05", "remaining_time": "1:06:57"}
|
||||||
|
{"current_steps": 470, "total_steps": 1114, "loss": 0.28023710250854494, "lr": 2.8949730700179535e-06, "epoch": 0.8445642407906558, "percentage": 42.19, "elapsed_time": "0:48:06", "remaining_time": "1:05:55"}
|
||||||
|
{"current_steps": 480, "total_steps": 1114, "loss": 0.280789852142334, "lr": 2.8500897666068224e-06, "epoch": 0.862533692722372, "percentage": 43.09, "elapsed_time": "0:49:07", "remaining_time": "1:04:53"}
|
||||||
|
{"current_steps": 490, "total_steps": 1114, "loss": 0.27998642921447753, "lr": 2.8052064631956914e-06, "epoch": 0.8805031446540881, "percentage": 43.99, "elapsed_time": "0:50:08", "remaining_time": "1:03:51"}
|
||||||
|
{"current_steps": 500, "total_steps": 1114, "loss": 0.2860716819763184, "lr": 2.7603231597845604e-06, "epoch": 0.8984725965858041, "percentage": 44.88, "elapsed_time": "0:51:10", "remaining_time": "1:02:50"}
|
||||||
|
{"current_steps": 510, "total_steps": 1114, "loss": 0.2779590845108032, "lr": 2.715439856373429e-06, "epoch": 0.9164420485175202, "percentage": 45.78, "elapsed_time": "0:52:11", "remaining_time": "1:01:48"}
|
||||||
|
{"current_steps": 520, "total_steps": 1114, "loss": 0.2789080381393433, "lr": 2.6705565529622983e-06, "epoch": 0.9344115004492363, "percentage": 46.68, "elapsed_time": "0:53:12", "remaining_time": "1:00:47"}
|
||||||
|
{"current_steps": 530, "total_steps": 1114, "loss": 0.28540740013122556, "lr": 2.6256732495511673e-06, "epoch": 0.9523809523809523, "percentage": 47.58, "elapsed_time": "0:54:14", "remaining_time": "0:59:45"}
|
||||||
|
{"current_steps": 540, "total_steps": 1114, "loss": 0.276381254196167, "lr": 2.580789946140036e-06, "epoch": 0.9703504043126685, "percentage": 48.47, "elapsed_time": "0:55:15", "remaining_time": "0:58:44"}
|
||||||
|
{"current_steps": 550, "total_steps": 1114, "loss": 0.2832359790802002, "lr": 2.535906642728905e-06, "epoch": 0.9883198562443846, "percentage": 49.37, "elapsed_time": "0:56:17", "remaining_time": "0:57:43"}
|
||||||
|
{"current_steps": 560, "total_steps": 1114, "loss": 0.2769860029220581, "lr": 2.491023339317774e-06, "epoch": 1.005390835579515, "percentage": 50.27, "elapsed_time": "0:58:26", "remaining_time": "0:57:48"}
|
||||||
|
{"current_steps": 570, "total_steps": 1114, "loss": 0.2540097713470459, "lr": 2.4461400359066427e-06, "epoch": 1.0233602875112309, "percentage": 51.17, "elapsed_time": "0:59:26", "remaining_time": "0:56:44"}
|
||||||
|
{"current_steps": 580, "total_steps": 1114, "loss": 0.2609401226043701, "lr": 2.4012567324955117e-06, "epoch": 1.041329739442947, "percentage": 52.06, "elapsed_time": "1:00:28", "remaining_time": "0:55:40"}
|
||||||
|
{"current_steps": 590, "total_steps": 1114, "loss": 0.25025138854980467, "lr": 2.356373429084381e-06, "epoch": 1.059299191374663, "percentage": 52.96, "elapsed_time": "1:01:28", "remaining_time": "0:54:36"}
|
||||||
|
{"current_steps": 600, "total_steps": 1114, "loss": 0.25263664722442625, "lr": 2.3114901256732496e-06, "epoch": 1.0772686433063792, "percentage": 53.86, "elapsed_time": "1:02:30", "remaining_time": "0:53:32"}
|
||||||
|
{"current_steps": 610, "total_steps": 1114, "loss": 0.25940570831298826, "lr": 2.2666068222621186e-06, "epoch": 1.0952380952380953, "percentage": 54.76, "elapsed_time": "1:03:31", "remaining_time": "0:52:29"}
|
||||||
|
{"current_steps": 620, "total_steps": 1114, "loss": 0.2521126508712769, "lr": 2.2217235188509876e-06, "epoch": 1.1132075471698113, "percentage": 55.66, "elapsed_time": "1:04:33", "remaining_time": "0:51:26"}
|
||||||
|
{"current_steps": 630, "total_steps": 1114, "loss": 0.25096635818481444, "lr": 2.1768402154398565e-06, "epoch": 1.1311769991015275, "percentage": 56.55, "elapsed_time": "1:05:35", "remaining_time": "0:50:23"}
|
||||||
|
{"current_steps": 640, "total_steps": 1114, "loss": 0.2539719581604004, "lr": 2.1319569120287255e-06, "epoch": 1.1491464510332434, "percentage": 57.45, "elapsed_time": "1:06:36", "remaining_time": "0:49:20"}
|
||||||
|
{"current_steps": 650, "total_steps": 1114, "loss": 0.2542546510696411, "lr": 2.0870736086175945e-06, "epoch": 1.1671159029649596, "percentage": 58.35, "elapsed_time": "1:07:38", "remaining_time": "0:48:16"}
|
||||||
|
{"current_steps": 660, "total_steps": 1114, "loss": 0.2506051778793335, "lr": 2.0421903052064634e-06, "epoch": 1.1850853548966755, "percentage": 59.25, "elapsed_time": "1:08:38", "remaining_time": "0:47:13"}
|
||||||
|
{"current_steps": 670, "total_steps": 1114, "loss": 0.2501336336135864, "lr": 1.9973070017953324e-06, "epoch": 1.2030548068283917, "percentage": 60.14, "elapsed_time": "1:09:40", "remaining_time": "0:46:10"}
|
||||||
|
{"current_steps": 680, "total_steps": 1114, "loss": 0.2527280330657959, "lr": 1.9524236983842014e-06, "epoch": 1.221024258760108, "percentage": 61.04, "elapsed_time": "1:10:42", "remaining_time": "0:45:07"}
|
||||||
|
{"current_steps": 690, "total_steps": 1114, "loss": 0.25149285793304443, "lr": 1.9075403949730703e-06, "epoch": 1.2389937106918238, "percentage": 61.94, "elapsed_time": "1:11:44", "remaining_time": "0:44:04"}
|
||||||
|
{"current_steps": 700, "total_steps": 1114, "loss": 0.2485593795776367, "lr": 1.862657091561939e-06, "epoch": 1.25696316262354, "percentage": 62.84, "elapsed_time": "1:12:45", "remaining_time": "0:43:01"}
|
||||||
|
{"current_steps": 710, "total_steps": 1114, "loss": 0.24832606315612793, "lr": 1.817773788150808e-06, "epoch": 1.2749326145552562, "percentage": 63.73, "elapsed_time": "1:13:46", "remaining_time": "0:41:58"}
|
||||||
|
{"current_steps": 720, "total_steps": 1114, "loss": 0.2521926164627075, "lr": 1.7728904847396768e-06, "epoch": 1.2929020664869721, "percentage": 64.63, "elapsed_time": "1:14:48", "remaining_time": "0:40:56"}
|
||||||
|
{"current_steps": 730, "total_steps": 1114, "loss": 0.25048768520355225, "lr": 1.728007181328546e-06, "epoch": 1.310871518418688, "percentage": 65.53, "elapsed_time": "1:15:49", "remaining_time": "0:39:53"}
|
||||||
|
{"current_steps": 740, "total_steps": 1114, "loss": 0.24852404594421387, "lr": 1.683123877917415e-06, "epoch": 1.3288409703504043, "percentage": 66.43, "elapsed_time": "1:16:51", "remaining_time": "0:38:50"}
|
||||||
|
{"current_steps": 750, "total_steps": 1114, "loss": 0.2514226198196411, "lr": 1.6382405745062837e-06, "epoch": 1.3468104222821204, "percentage": 67.32, "elapsed_time": "1:17:51", "remaining_time": "0:37:47"}
|
||||||
|
{"current_steps": 760, "total_steps": 1114, "loss": 0.2507458686828613, "lr": 1.593357271095153e-06, "epoch": 1.3647798742138364, "percentage": 68.22, "elapsed_time": "1:18:53", "remaining_time": "0:36:44"}
|
||||||
|
{"current_steps": 770, "total_steps": 1114, "loss": 0.2497103691101074, "lr": 1.5484739676840217e-06, "epoch": 1.3827493261455526, "percentage": 69.12, "elapsed_time": "1:19:54", "remaining_time": "0:35:41"}
|
||||||
|
{"current_steps": 780, "total_steps": 1114, "loss": 0.24853968620300293, "lr": 1.5035906642728906e-06, "epoch": 1.4007187780772687, "percentage": 70.02, "elapsed_time": "1:20:54", "remaining_time": "0:34:38"}
|
||||||
|
{"current_steps": 790, "total_steps": 1114, "loss": 0.248740816116333, "lr": 1.4587073608617596e-06, "epoch": 1.4186882300089847, "percentage": 70.92, "elapsed_time": "1:21:56", "remaining_time": "0:33:36"}
|
||||||
|
{"current_steps": 800, "total_steps": 1114, "loss": 0.2534752368927002, "lr": 1.4138240574506283e-06, "epoch": 1.4366576819407009, "percentage": 71.81, "elapsed_time": "1:22:57", "remaining_time": "0:32:33"}
|
||||||
|
{"current_steps": 810, "total_steps": 1114, "loss": 0.24337444305419922, "lr": 1.3689407540394975e-06, "epoch": 1.4546271338724168, "percentage": 72.71, "elapsed_time": "1:23:58", "remaining_time": "0:31:31"}
|
||||||
|
{"current_steps": 820, "total_steps": 1114, "loss": 0.2502609729766846, "lr": 1.3240574506283663e-06, "epoch": 1.472596585804133, "percentage": 73.61, "elapsed_time": "1:25:00", "remaining_time": "0:30:28"}
|
||||||
|
{"current_steps": 830, "total_steps": 1114, "loss": 0.24652738571166993, "lr": 1.2791741472172353e-06, "epoch": 1.490566037735849, "percentage": 74.51, "elapsed_time": "1:26:01", "remaining_time": "0:29:26"}
|
||||||
|
{"current_steps": 840, "total_steps": 1114, "loss": 0.25362207889556887, "lr": 1.2342908438061042e-06, "epoch": 1.5085354896675651, "percentage": 75.4, "elapsed_time": "1:27:03", "remaining_time": "0:28:23"}
|
||||||
|
{"current_steps": 850, "total_steps": 1114, "loss": 0.24814538955688475, "lr": 1.1894075403949732e-06, "epoch": 1.5265049415992813, "percentage": 76.3, "elapsed_time": "1:28:04", "remaining_time": "0:27:21"}
|
||||||
|
{"current_steps": 860, "total_steps": 1114, "loss": 0.2517171621322632, "lr": 1.144524236983842e-06, "epoch": 1.5444743935309972, "percentage": 77.2, "elapsed_time": "1:29:06", "remaining_time": "0:26:18"}
|
||||||
|
{"current_steps": 870, "total_steps": 1114, "loss": 0.25095720291137696, "lr": 1.0996409335727111e-06, "epoch": 1.5624438454627134, "percentage": 78.1, "elapsed_time": "1:30:07", "remaining_time": "0:25:16"}
|
||||||
|
{"current_steps": 880, "total_steps": 1114, "loss": 0.25122294425964353, "lr": 1.05475763016158e-06, "epoch": 1.5804132973944296, "percentage": 78.99, "elapsed_time": "1:31:08", "remaining_time": "0:24:14"}
|
||||||
|
{"current_steps": 890, "total_steps": 1114, "loss": 0.24518187046051027, "lr": 1.0098743267504488e-06, "epoch": 1.5983827493261455, "percentage": 79.89, "elapsed_time": "1:32:10", "remaining_time": "0:23:11"}
|
||||||
|
{"current_steps": 900, "total_steps": 1114, "loss": 0.25164237022399905, "lr": 9.649910233393178e-07, "epoch": 1.6163522012578615, "percentage": 80.79, "elapsed_time": "1:33:11", "remaining_time": "0:22:09"}
|
||||||
|
{"current_steps": 910, "total_steps": 1114, "loss": 0.2497255325317383, "lr": 9.201077199281867e-07, "epoch": 1.6343216531895777, "percentage": 81.69, "elapsed_time": "1:34:12", "remaining_time": "0:21:07"}
|
||||||
|
{"current_steps": 920, "total_steps": 1114, "loss": 0.2494762897491455, "lr": 8.752244165170558e-07, "epoch": 1.6522911051212938, "percentage": 82.59, "elapsed_time": "1:35:13", "remaining_time": "0:20:04"}
|
||||||
|
{"current_steps": 930, "total_steps": 1114, "loss": 0.24840357303619384, "lr": 8.303411131059247e-07, "epoch": 1.6702605570530098, "percentage": 83.48, "elapsed_time": "1:36:15", "remaining_time": "0:19:02"}
|
||||||
|
{"current_steps": 940, "total_steps": 1114, "loss": 0.25184221267700196, "lr": 7.854578096947936e-07, "epoch": 1.688230008984726, "percentage": 84.38, "elapsed_time": "1:37:17", "remaining_time": "0:18:00"}
|
||||||
|
{"current_steps": 950, "total_steps": 1114, "loss": 0.25055861473083496, "lr": 7.405745062836626e-07, "epoch": 1.7061994609164421, "percentage": 85.28, "elapsed_time": "1:38:18", "remaining_time": "0:16:58"}
|
||||||
|
{"current_steps": 960, "total_steps": 1114, "loss": 0.25119876861572266, "lr": 6.956912028725314e-07, "epoch": 1.724168912848158, "percentage": 86.18, "elapsed_time": "1:39:19", "remaining_time": "0:15:55"}
|
||||||
|
{"current_steps": 970, "total_steps": 1114, "loss": 0.2516517162322998, "lr": 6.508078994614005e-07, "epoch": 1.742138364779874, "percentage": 87.07, "elapsed_time": "1:40:20", "remaining_time": "0:14:53"}
|
||||||
|
{"current_steps": 980, "total_steps": 1114, "loss": 0.250733470916748, "lr": 6.059245960502694e-07, "epoch": 1.7601078167115904, "percentage": 87.97, "elapsed_time": "1:41:22", "remaining_time": "0:13:51"}
|
||||||
|
{"current_steps": 990, "total_steps": 1114, "loss": 0.2478208065032959, "lr": 5.610412926391383e-07, "epoch": 1.7780772686433064, "percentage": 88.87, "elapsed_time": "1:42:23", "remaining_time": "0:12:49"}
|
||||||
|
{"current_steps": 1000, "total_steps": 1114, "loss": 0.24935145378112794, "lr": 5.161579892280072e-07, "epoch": 1.7960467205750223, "percentage": 89.77, "elapsed_time": "1:43:24", "remaining_time": "0:11:47"}
|
||||||
|
{"current_steps": 1010, "total_steps": 1114, "loss": 0.24984090328216552, "lr": 4.7127468581687615e-07, "epoch": 1.8140161725067385, "percentage": 90.66, "elapsed_time": "1:44:26", "remaining_time": "0:10:45"}
|
||||||
|
{"current_steps": 1020, "total_steps": 1114, "loss": 0.24917204380035402, "lr": 4.2639138240574507e-07, "epoch": 1.8319856244384547, "percentage": 91.56, "elapsed_time": "1:45:27", "remaining_time": "0:09:43"}
|
||||||
|
{"current_steps": 1030, "total_steps": 1114, "loss": 0.24739840030670165, "lr": 3.815080789946141e-07, "epoch": 1.8499550763701706, "percentage": 92.46, "elapsed_time": "1:46:28", "remaining_time": "0:08:40"}
|
||||||
|
{"current_steps": 1040, "total_steps": 1114, "loss": 0.24694859981536865, "lr": 3.3662477558348295e-07, "epoch": 1.8679245283018868, "percentage": 93.36, "elapsed_time": "1:47:30", "remaining_time": "0:07:38"}
|
||||||
|
{"current_steps": 1050, "total_steps": 1114, "loss": 0.24703009128570558, "lr": 2.917414721723519e-07, "epoch": 1.885893980233603, "percentage": 94.25, "elapsed_time": "1:48:32", "remaining_time": "0:06:36"}
|
||||||
|
{"current_steps": 1060, "total_steps": 1114, "loss": 0.2495879650115967, "lr": 2.4685816876122083e-07, "epoch": 1.903863432165319, "percentage": 95.15, "elapsed_time": "1:49:33", "remaining_time": "0:05:34"}
|
||||||
|
{"current_steps": 1070, "total_steps": 1114, "loss": 0.25526316165924073, "lr": 2.0197486535008978e-07, "epoch": 1.921832884097035, "percentage": 96.05, "elapsed_time": "1:50:34", "remaining_time": "0:04:32"}
|
||||||
|
{"current_steps": 1080, "total_steps": 1114, "loss": 0.24767594337463378, "lr": 1.5709156193895872e-07, "epoch": 1.939802336028751, "percentage": 96.95, "elapsed_time": "1:51:35", "remaining_time": "0:03:30"}
|
||||||
|
{"current_steps": 1090, "total_steps": 1114, "loss": 0.24955098628997802, "lr": 1.1220825852782766e-07, "epoch": 1.9577717879604672, "percentage": 97.85, "elapsed_time": "1:52:38", "remaining_time": "0:02:28"}
|
||||||
|
{"current_steps": 1100, "total_steps": 1114, "loss": 0.2489546775817871, "lr": 6.732495511669659e-08, "epoch": 1.9757412398921832, "percentage": 98.74, "elapsed_time": "1:53:39", "remaining_time": "0:01:26"}
|
||||||
|
{"current_steps": 1110, "total_steps": 1114, "loss": 0.24430301189422607, "lr": 2.2441651705565532e-08, "epoch": 1.9937106918238994, "percentage": 99.64, "elapsed_time": "1:54:40", "remaining_time": "0:00:24"}
|
||||||
|
{"current_steps": 1114, "total_steps": 1114, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "1:56:10", "remaining_time": "0:00:00"}
|
||||||
820
trainer_state.json
Normal file
820
trainer_state.json
Normal file
@@ -0,0 +1,820 @@
|
|||||||
|
{
|
||||||
|
"best_global_step": null,
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 2.0,
|
||||||
|
"eval_steps": 500,
|
||||||
|
"global_step": 1114,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [
|
||||||
|
{
|
||||||
|
"epoch": 0.017969451931716084,
|
||||||
|
"grad_norm": 1.0288746356964111,
|
||||||
|
"learning_rate": 4.959605026929982e-06,
|
||||||
|
"loss": 0.5949527740478515,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.03593890386343217,
|
||||||
|
"grad_norm": 0.6101402044296265,
|
||||||
|
"learning_rate": 4.9147217235188516e-06,
|
||||||
|
"loss": 0.41325950622558594,
|
||||||
|
"step": 20
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.05390835579514825,
|
||||||
|
"grad_norm": 0.5305581092834473,
|
||||||
|
"learning_rate": 4.86983842010772e-06,
|
||||||
|
"loss": 0.3709533929824829,
|
||||||
|
"step": 30
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.07187780772686433,
|
||||||
|
"grad_norm": 0.5169686675071716,
|
||||||
|
"learning_rate": 4.8249551166965895e-06,
|
||||||
|
"loss": 0.3509422540664673,
|
||||||
|
"step": 40
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.08984725965858041,
|
||||||
|
"grad_norm": 0.5222465991973877,
|
||||||
|
"learning_rate": 4.780071813285458e-06,
|
||||||
|
"loss": 0.3454415321350098,
|
||||||
|
"step": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.1078167115902965,
|
||||||
|
"grad_norm": 0.4856426417827606,
|
||||||
|
"learning_rate": 4.7351885098743274e-06,
|
||||||
|
"loss": 0.33249969482421876,
|
||||||
|
"step": 60
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.12578616352201258,
|
||||||
|
"grad_norm": 0.5357626676559448,
|
||||||
|
"learning_rate": 4.690305206463196e-06,
|
||||||
|
"loss": 0.3292850971221924,
|
||||||
|
"step": 70
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.14375561545372867,
|
||||||
|
"grad_norm": 0.46837398409843445,
|
||||||
|
"learning_rate": 4.6454219030520645e-06,
|
||||||
|
"loss": 0.3244313716888428,
|
||||||
|
"step": 80
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.16172506738544473,
|
||||||
|
"grad_norm": 0.48174625635147095,
|
||||||
|
"learning_rate": 4.600538599640934e-06,
|
||||||
|
"loss": 0.3236015558242798,
|
||||||
|
"step": 90
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.17969451931716082,
|
||||||
|
"grad_norm": 0.5259532332420349,
|
||||||
|
"learning_rate": 4.5556552962298025e-06,
|
||||||
|
"loss": 0.3194127559661865,
|
||||||
|
"step": 100
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.1976639712488769,
|
||||||
|
"grad_norm": 0.5332797765731812,
|
||||||
|
"learning_rate": 4.510771992818672e-06,
|
||||||
|
"loss": 0.31786675453186036,
|
||||||
|
"step": 110
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.215633423180593,
|
||||||
|
"grad_norm": 0.4909115433692932,
|
||||||
|
"learning_rate": 4.465888689407541e-06,
|
||||||
|
"loss": 0.3128951072692871,
|
||||||
|
"step": 120
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.23360287511230907,
|
||||||
|
"grad_norm": 0.4780581295490265,
|
||||||
|
"learning_rate": 4.42100538599641e-06,
|
||||||
|
"loss": 0.31437077522277834,
|
||||||
|
"step": 130
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.25157232704402516,
|
||||||
|
"grad_norm": 0.5149464011192322,
|
||||||
|
"learning_rate": 4.376122082585278e-06,
|
||||||
|
"loss": 0.30969116687774656,
|
||||||
|
"step": 140
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.2695417789757412,
|
||||||
|
"grad_norm": 0.4815337657928467,
|
||||||
|
"learning_rate": 4.331238779174148e-06,
|
||||||
|
"loss": 0.31036303043365476,
|
||||||
|
"step": 150
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.28751123090745734,
|
||||||
|
"grad_norm": 0.4882141053676605,
|
||||||
|
"learning_rate": 4.286355475763016e-06,
|
||||||
|
"loss": 0.30779433250427246,
|
||||||
|
"step": 160
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3054806828391734,
|
||||||
|
"grad_norm": 0.47035127878189087,
|
||||||
|
"learning_rate": 4.241472172351886e-06,
|
||||||
|
"loss": 0.3062736511230469,
|
||||||
|
"step": 170
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.32345013477088946,
|
||||||
|
"grad_norm": 0.47444701194763184,
|
||||||
|
"learning_rate": 4.196588868940754e-06,
|
||||||
|
"loss": 0.30041847229003904,
|
||||||
|
"step": 180
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3414195867026056,
|
||||||
|
"grad_norm": 0.4834694564342499,
|
||||||
|
"learning_rate": 4.151705565529624e-06,
|
||||||
|
"loss": 0.29730544090270994,
|
||||||
|
"step": 190
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.35938903863432164,
|
||||||
|
"grad_norm": 0.508245587348938,
|
||||||
|
"learning_rate": 4.106822262118492e-06,
|
||||||
|
"loss": 0.30029687881469724,
|
||||||
|
"step": 200
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.37735849056603776,
|
||||||
|
"grad_norm": 0.48643767833709717,
|
||||||
|
"learning_rate": 4.061938958707361e-06,
|
||||||
|
"loss": 0.29685449600219727,
|
||||||
|
"step": 210
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3953279424977538,
|
||||||
|
"grad_norm": 0.4577917456626892,
|
||||||
|
"learning_rate": 4.01705565529623e-06,
|
||||||
|
"loss": 0.2990954160690308,
|
||||||
|
"step": 220
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.4132973944294699,
|
||||||
|
"grad_norm": 0.5375077724456787,
|
||||||
|
"learning_rate": 3.9721723518850995e-06,
|
||||||
|
"loss": 0.30233011245727537,
|
||||||
|
"step": 230
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.431266846361186,
|
||||||
|
"grad_norm": 0.4925467371940613,
|
||||||
|
"learning_rate": 3.927289048473968e-06,
|
||||||
|
"loss": 0.2941945314407349,
|
||||||
|
"step": 240
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.44923629829290207,
|
||||||
|
"grad_norm": 0.5110061168670654,
|
||||||
|
"learning_rate": 3.882405745062837e-06,
|
||||||
|
"loss": 0.3003401279449463,
|
||||||
|
"step": 250
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.46720575022461813,
|
||||||
|
"grad_norm": 0.44966429471969604,
|
||||||
|
"learning_rate": 3.837522441651706e-06,
|
||||||
|
"loss": 0.2935019016265869,
|
||||||
|
"step": 260
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.48517520215633425,
|
||||||
|
"grad_norm": 0.49473223090171814,
|
||||||
|
"learning_rate": 3.792639138240575e-06,
|
||||||
|
"loss": 0.2941242218017578,
|
||||||
|
"step": 270
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5031446540880503,
|
||||||
|
"grad_norm": 0.4826172888278961,
|
||||||
|
"learning_rate": 3.7477558348294435e-06,
|
||||||
|
"loss": 0.2936396598815918,
|
||||||
|
"step": 280
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5211141060197664,
|
||||||
|
"grad_norm": 0.5087786316871643,
|
||||||
|
"learning_rate": 3.702872531418313e-06,
|
||||||
|
"loss": 0.28728442192077636,
|
||||||
|
"step": 290
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5390835579514824,
|
||||||
|
"grad_norm": 0.45754265785217285,
|
||||||
|
"learning_rate": 3.6579892280071814e-06,
|
||||||
|
"loss": 0.29381372928619387,
|
||||||
|
"step": 300
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5570530098831986,
|
||||||
|
"grad_norm": 0.47864410281181335,
|
||||||
|
"learning_rate": 3.6131059245960504e-06,
|
||||||
|
"loss": 0.28871979713439944,
|
||||||
|
"step": 310
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5750224618149147,
|
||||||
|
"grad_norm": 0.446613073348999,
|
||||||
|
"learning_rate": 3.5682226211849198e-06,
|
||||||
|
"loss": 0.2919660806655884,
|
||||||
|
"step": 320
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5929919137466307,
|
||||||
|
"grad_norm": 0.5203211903572083,
|
||||||
|
"learning_rate": 3.5233393177737883e-06,
|
||||||
|
"loss": 0.2949108600616455,
|
||||||
|
"step": 330
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6109613656783468,
|
||||||
|
"grad_norm": 0.5251737236976624,
|
||||||
|
"learning_rate": 3.4784560143626573e-06,
|
||||||
|
"loss": 0.28940815925598146,
|
||||||
|
"step": 340
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6289308176100629,
|
||||||
|
"grad_norm": 0.4626797139644623,
|
||||||
|
"learning_rate": 3.4335727109515267e-06,
|
||||||
|
"loss": 0.2877013683319092,
|
||||||
|
"step": 350
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6469002695417789,
|
||||||
|
"grad_norm": 0.5425576567649841,
|
||||||
|
"learning_rate": 3.3886894075403952e-06,
|
||||||
|
"loss": 0.28816981315612794,
|
||||||
|
"step": 360
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6648697214734951,
|
||||||
|
"grad_norm": 0.5507893562316895,
|
||||||
|
"learning_rate": 3.343806104129264e-06,
|
||||||
|
"loss": 0.28344998359680174,
|
||||||
|
"step": 370
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6828391734052112,
|
||||||
|
"grad_norm": 0.45895373821258545,
|
||||||
|
"learning_rate": 3.2989228007181327e-06,
|
||||||
|
"loss": 0.28453927040100097,
|
||||||
|
"step": 380
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7008086253369272,
|
||||||
|
"grad_norm": 0.47491055727005005,
|
||||||
|
"learning_rate": 3.254039497307002e-06,
|
||||||
|
"loss": 0.27885701656341555,
|
||||||
|
"step": 390
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7187780772686433,
|
||||||
|
"grad_norm": 0.4567403793334961,
|
||||||
|
"learning_rate": 3.209156193895871e-06,
|
||||||
|
"loss": 0.2846828937530518,
|
||||||
|
"step": 400
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7367475292003594,
|
||||||
|
"grad_norm": 0.506420910358429,
|
||||||
|
"learning_rate": 3.1642728904847396e-06,
|
||||||
|
"loss": 0.2884047269821167,
|
||||||
|
"step": 410
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7547169811320755,
|
||||||
|
"grad_norm": 0.4960302710533142,
|
||||||
|
"learning_rate": 3.119389587073609e-06,
|
||||||
|
"loss": 0.28609886169433596,
|
||||||
|
"step": 420
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7726864330637916,
|
||||||
|
"grad_norm": 0.44618239998817444,
|
||||||
|
"learning_rate": 3.074506283662478e-06,
|
||||||
|
"loss": 0.2808084487915039,
|
||||||
|
"step": 430
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7906558849955077,
|
||||||
|
"grad_norm": 0.45904698967933655,
|
||||||
|
"learning_rate": 3.0296229802513465e-06,
|
||||||
|
"loss": 0.28656601905822754,
|
||||||
|
"step": 440
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8086253369272237,
|
||||||
|
"grad_norm": 0.5420985817909241,
|
||||||
|
"learning_rate": 2.984739676840216e-06,
|
||||||
|
"loss": 0.2885767936706543,
|
||||||
|
"step": 450
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8265947888589398,
|
||||||
|
"grad_norm": 0.49061647057533264,
|
||||||
|
"learning_rate": 2.939856373429085e-06,
|
||||||
|
"loss": 0.28384861946105955,
|
||||||
|
"step": 460
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8445642407906558,
|
||||||
|
"grad_norm": 0.5167312026023865,
|
||||||
|
"learning_rate": 2.8949730700179535e-06,
|
||||||
|
"loss": 0.28023710250854494,
|
||||||
|
"step": 470
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.862533692722372,
|
||||||
|
"grad_norm": 0.46029844880104065,
|
||||||
|
"learning_rate": 2.8500897666068224e-06,
|
||||||
|
"loss": 0.280789852142334,
|
||||||
|
"step": 480
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8805031446540881,
|
||||||
|
"grad_norm": 0.44982901215553284,
|
||||||
|
"learning_rate": 2.8052064631956914e-06,
|
||||||
|
"loss": 0.27998642921447753,
|
||||||
|
"step": 490
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8984725965858041,
|
||||||
|
"grad_norm": 0.4832385182380676,
|
||||||
|
"learning_rate": 2.7603231597845604e-06,
|
||||||
|
"loss": 0.2860716819763184,
|
||||||
|
"step": 500
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9164420485175202,
|
||||||
|
"grad_norm": 0.5139860510826111,
|
||||||
|
"learning_rate": 2.715439856373429e-06,
|
||||||
|
"loss": 0.2779590845108032,
|
||||||
|
"step": 510
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9344115004492363,
|
||||||
|
"grad_norm": 0.4550414979457855,
|
||||||
|
"learning_rate": 2.6705565529622983e-06,
|
||||||
|
"loss": 0.2789080381393433,
|
||||||
|
"step": 520
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9523809523809523,
|
||||||
|
"grad_norm": 0.4613369107246399,
|
||||||
|
"learning_rate": 2.6256732495511673e-06,
|
||||||
|
"loss": 0.28540740013122556,
|
||||||
|
"step": 530
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9703504043126685,
|
||||||
|
"grad_norm": 0.45095086097717285,
|
||||||
|
"learning_rate": 2.580789946140036e-06,
|
||||||
|
"loss": 0.276381254196167,
|
||||||
|
"step": 540
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9883198562443846,
|
||||||
|
"grad_norm": 0.48203322291374207,
|
||||||
|
"learning_rate": 2.535906642728905e-06,
|
||||||
|
"loss": 0.2832359790802002,
|
||||||
|
"step": 550
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.005390835579515,
|
||||||
|
"grad_norm": 0.4708728492259979,
|
||||||
|
"learning_rate": 2.491023339317774e-06,
|
||||||
|
"loss": 0.2769860029220581,
|
||||||
|
"step": 560
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.0233602875112309,
|
||||||
|
"grad_norm": 0.4912715554237366,
|
||||||
|
"learning_rate": 2.4461400359066427e-06,
|
||||||
|
"loss": 0.2540097713470459,
|
||||||
|
"step": 570
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.041329739442947,
|
||||||
|
"grad_norm": 0.48824694752693176,
|
||||||
|
"learning_rate": 2.4012567324955117e-06,
|
||||||
|
"loss": 0.2609401226043701,
|
||||||
|
"step": 580
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.059299191374663,
|
||||||
|
"grad_norm": 0.4870210289955139,
|
||||||
|
"learning_rate": 2.356373429084381e-06,
|
||||||
|
"loss": 0.25025138854980467,
|
||||||
|
"step": 590
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.0772686433063792,
|
||||||
|
"grad_norm": 0.5163658261299133,
|
||||||
|
"learning_rate": 2.3114901256732496e-06,
|
||||||
|
"loss": 0.25263664722442625,
|
||||||
|
"step": 600
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.0952380952380953,
|
||||||
|
"grad_norm": 0.5006254315376282,
|
||||||
|
"learning_rate": 2.2666068222621186e-06,
|
||||||
|
"loss": 0.25940570831298826,
|
||||||
|
"step": 610
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.1132075471698113,
|
||||||
|
"grad_norm": 0.511043131351471,
|
||||||
|
"learning_rate": 2.2217235188509876e-06,
|
||||||
|
"loss": 0.2521126508712769,
|
||||||
|
"step": 620
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.1311769991015275,
|
||||||
|
"grad_norm": 0.49282217025756836,
|
||||||
|
"learning_rate": 2.1768402154398565e-06,
|
||||||
|
"loss": 0.25096635818481444,
|
||||||
|
"step": 630
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.1491464510332434,
|
||||||
|
"grad_norm": 0.5031591653823853,
|
||||||
|
"learning_rate": 2.1319569120287255e-06,
|
||||||
|
"loss": 0.2539719581604004,
|
||||||
|
"step": 640
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.1671159029649596,
|
||||||
|
"grad_norm": 0.5004000067710876,
|
||||||
|
"learning_rate": 2.0870736086175945e-06,
|
||||||
|
"loss": 0.2542546510696411,
|
||||||
|
"step": 650
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.1850853548966755,
|
||||||
|
"grad_norm": 0.47906896471977234,
|
||||||
|
"learning_rate": 2.0421903052064634e-06,
|
||||||
|
"loss": 0.2506051778793335,
|
||||||
|
"step": 660
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.2030548068283917,
|
||||||
|
"grad_norm": 0.5111077427864075,
|
||||||
|
"learning_rate": 1.9973070017953324e-06,
|
||||||
|
"loss": 0.2501336336135864,
|
||||||
|
"step": 670
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.221024258760108,
|
||||||
|
"grad_norm": 0.46470290422439575,
|
||||||
|
"learning_rate": 1.9524236983842014e-06,
|
||||||
|
"loss": 0.2527280330657959,
|
||||||
|
"step": 680
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.2389937106918238,
|
||||||
|
"grad_norm": 0.49279844760894775,
|
||||||
|
"learning_rate": 1.9075403949730703e-06,
|
||||||
|
"loss": 0.25149285793304443,
|
||||||
|
"step": 690
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.25696316262354,
|
||||||
|
"grad_norm": 0.48192131519317627,
|
||||||
|
"learning_rate": 1.862657091561939e-06,
|
||||||
|
"loss": 0.2485593795776367,
|
||||||
|
"step": 700
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.2749326145552562,
|
||||||
|
"grad_norm": 0.49026069045066833,
|
||||||
|
"learning_rate": 1.817773788150808e-06,
|
||||||
|
"loss": 0.24832606315612793,
|
||||||
|
"step": 710
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.2929020664869721,
|
||||||
|
"grad_norm": 0.46640709042549133,
|
||||||
|
"learning_rate": 1.7728904847396768e-06,
|
||||||
|
"loss": 0.2521926164627075,
|
||||||
|
"step": 720
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.310871518418688,
|
||||||
|
"grad_norm": 0.5054717063903809,
|
||||||
|
"learning_rate": 1.728007181328546e-06,
|
||||||
|
"loss": 0.25048768520355225,
|
||||||
|
"step": 730
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.3288409703504043,
|
||||||
|
"grad_norm": 0.4634091258049011,
|
||||||
|
"learning_rate": 1.683123877917415e-06,
|
||||||
|
"loss": 0.24852404594421387,
|
||||||
|
"step": 740
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.3468104222821204,
|
||||||
|
"grad_norm": 0.4614594578742981,
|
||||||
|
"learning_rate": 1.6382405745062837e-06,
|
||||||
|
"loss": 0.2514226198196411,
|
||||||
|
"step": 750
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.3647798742138364,
|
||||||
|
"grad_norm": 0.5008041262626648,
|
||||||
|
"learning_rate": 1.593357271095153e-06,
|
||||||
|
"loss": 0.2507458686828613,
|
||||||
|
"step": 760
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.3827493261455526,
|
||||||
|
"grad_norm": 0.47305938601493835,
|
||||||
|
"learning_rate": 1.5484739676840217e-06,
|
||||||
|
"loss": 0.2497103691101074,
|
||||||
|
"step": 770
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.4007187780772687,
|
||||||
|
"grad_norm": 0.5139908194541931,
|
||||||
|
"learning_rate": 1.5035906642728906e-06,
|
||||||
|
"loss": 0.24853968620300293,
|
||||||
|
"step": 780
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.4186882300089847,
|
||||||
|
"grad_norm": 0.4631156027317047,
|
||||||
|
"learning_rate": 1.4587073608617596e-06,
|
||||||
|
"loss": 0.248740816116333,
|
||||||
|
"step": 790
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.4366576819407009,
|
||||||
|
"grad_norm": 0.47681012749671936,
|
||||||
|
"learning_rate": 1.4138240574506283e-06,
|
||||||
|
"loss": 0.2534752368927002,
|
||||||
|
"step": 800
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.4546271338724168,
|
||||||
|
"grad_norm": 0.4538913667201996,
|
||||||
|
"learning_rate": 1.3689407540394975e-06,
|
||||||
|
"loss": 0.24337444305419922,
|
||||||
|
"step": 810
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.472596585804133,
|
||||||
|
"grad_norm": 0.48104986548423767,
|
||||||
|
"learning_rate": 1.3240574506283663e-06,
|
||||||
|
"loss": 0.2502609729766846,
|
||||||
|
"step": 820
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.490566037735849,
|
||||||
|
"grad_norm": 0.4610423147678375,
|
||||||
|
"learning_rate": 1.2791741472172353e-06,
|
||||||
|
"loss": 0.24652738571166993,
|
||||||
|
"step": 830
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5085354896675651,
|
||||||
|
"grad_norm": 0.4587244689464569,
|
||||||
|
"learning_rate": 1.2342908438061042e-06,
|
||||||
|
"loss": 0.25362207889556887,
|
||||||
|
"step": 840
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5265049415992813,
|
||||||
|
"grad_norm": 0.4708814322948456,
|
||||||
|
"learning_rate": 1.1894075403949732e-06,
|
||||||
|
"loss": 0.24814538955688475,
|
||||||
|
"step": 850
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5444743935309972,
|
||||||
|
"grad_norm": 0.4898167550563812,
|
||||||
|
"learning_rate": 1.144524236983842e-06,
|
||||||
|
"loss": 0.2517171621322632,
|
||||||
|
"step": 860
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5624438454627134,
|
||||||
|
"grad_norm": 0.5054773688316345,
|
||||||
|
"learning_rate": 1.0996409335727111e-06,
|
||||||
|
"loss": 0.25095720291137696,
|
||||||
|
"step": 870
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5804132973944296,
|
||||||
|
"grad_norm": 0.5150067806243896,
|
||||||
|
"learning_rate": 1.05475763016158e-06,
|
||||||
|
"loss": 0.25122294425964353,
|
||||||
|
"step": 880
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.5983827493261455,
|
||||||
|
"grad_norm": 0.44859108328819275,
|
||||||
|
"learning_rate": 1.0098743267504488e-06,
|
||||||
|
"loss": 0.24518187046051027,
|
||||||
|
"step": 890
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6163522012578615,
|
||||||
|
"grad_norm": 0.4460717737674713,
|
||||||
|
"learning_rate": 9.649910233393178e-07,
|
||||||
|
"loss": 0.25164237022399905,
|
||||||
|
"step": 900
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6343216531895777,
|
||||||
|
"grad_norm": 0.4884060323238373,
|
||||||
|
"learning_rate": 9.201077199281867e-07,
|
||||||
|
"loss": 0.2497255325317383,
|
||||||
|
"step": 910
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6522911051212938,
|
||||||
|
"grad_norm": 0.4527634084224701,
|
||||||
|
"learning_rate": 8.752244165170558e-07,
|
||||||
|
"loss": 0.2494762897491455,
|
||||||
|
"step": 920
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.6702605570530098,
|
||||||
|
"grad_norm": 0.47182497382164,
|
||||||
|
"learning_rate": 8.303411131059247e-07,
|
||||||
|
"loss": 0.24840357303619384,
|
||||||
|
"step": 930
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.688230008984726,
|
||||||
|
"grad_norm": 0.4759376347064972,
|
||||||
|
"learning_rate": 7.854578096947936e-07,
|
||||||
|
"loss": 0.25184221267700196,
|
||||||
|
"step": 940
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.7061994609164421,
|
||||||
|
"grad_norm": 0.495343416929245,
|
||||||
|
"learning_rate": 7.405745062836626e-07,
|
||||||
|
"loss": 0.25055861473083496,
|
||||||
|
"step": 950
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.724168912848158,
|
||||||
|
"grad_norm": 0.5005154609680176,
|
||||||
|
"learning_rate": 6.956912028725314e-07,
|
||||||
|
"loss": 0.25119876861572266,
|
||||||
|
"step": 960
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.742138364779874,
|
||||||
|
"grad_norm": 0.47676777839660645,
|
||||||
|
"learning_rate": 6.508078994614005e-07,
|
||||||
|
"loss": 0.2516517162322998,
|
||||||
|
"step": 970
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.7601078167115904,
|
||||||
|
"grad_norm": 0.4394581913948059,
|
||||||
|
"learning_rate": 6.059245960502694e-07,
|
||||||
|
"loss": 0.250733470916748,
|
||||||
|
"step": 980
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.7780772686433064,
|
||||||
|
"grad_norm": 0.4702657163143158,
|
||||||
|
"learning_rate": 5.610412926391383e-07,
|
||||||
|
"loss": 0.2478208065032959,
|
||||||
|
"step": 990
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.7960467205750223,
|
||||||
|
"grad_norm": 0.4843612611293793,
|
||||||
|
"learning_rate": 5.161579892280072e-07,
|
||||||
|
"loss": 0.24935145378112794,
|
||||||
|
"step": 1000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.8140161725067385,
|
||||||
|
"grad_norm": 0.4673105776309967,
|
||||||
|
"learning_rate": 4.7127468581687615e-07,
|
||||||
|
"loss": 0.24984090328216552,
|
||||||
|
"step": 1010
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.8319856244384547,
|
||||||
|
"grad_norm": 0.4820215404033661,
|
||||||
|
"learning_rate": 4.2639138240574507e-07,
|
||||||
|
"loss": 0.24917204380035402,
|
||||||
|
"step": 1020
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.8499550763701706,
|
||||||
|
"grad_norm": 0.45277148485183716,
|
||||||
|
"learning_rate": 3.815080789946141e-07,
|
||||||
|
"loss": 0.24739840030670165,
|
||||||
|
"step": 1030
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.8679245283018868,
|
||||||
|
"grad_norm": 0.48467275500297546,
|
||||||
|
"learning_rate": 3.3662477558348295e-07,
|
||||||
|
"loss": 0.24694859981536865,
|
||||||
|
"step": 1040
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.885893980233603,
|
||||||
|
"grad_norm": 0.46758314967155457,
|
||||||
|
"learning_rate": 2.917414721723519e-07,
|
||||||
|
"loss": 0.24703009128570558,
|
||||||
|
"step": 1050
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.903863432165319,
|
||||||
|
"grad_norm": 0.4634384512901306,
|
||||||
|
"learning_rate": 2.4685816876122083e-07,
|
||||||
|
"loss": 0.2495879650115967,
|
||||||
|
"step": 1060
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.921832884097035,
|
||||||
|
"grad_norm": 0.4621906578540802,
|
||||||
|
"learning_rate": 2.0197486535008978e-07,
|
||||||
|
"loss": 0.25526316165924073,
|
||||||
|
"step": 1070
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.939802336028751,
|
||||||
|
"grad_norm": 0.46646031737327576,
|
||||||
|
"learning_rate": 1.5709156193895872e-07,
|
||||||
|
"loss": 0.24767594337463378,
|
||||||
|
"step": 1080
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.9577717879604672,
|
||||||
|
"grad_norm": 0.4569203555583954,
|
||||||
|
"learning_rate": 1.1220825852782766e-07,
|
||||||
|
"loss": 0.24955098628997802,
|
||||||
|
"step": 1090
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.9757412398921832,
|
||||||
|
"grad_norm": 0.47747698426246643,
|
||||||
|
"learning_rate": 6.732495511669659e-08,
|
||||||
|
"loss": 0.2489546775817871,
|
||||||
|
"step": 1100
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 1.9937106918238994,
|
||||||
|
"grad_norm": 0.47046294808387756,
|
||||||
|
"learning_rate": 2.2441651705565532e-08,
|
||||||
|
"loss": 0.24430301189422607,
|
||||||
|
"step": 1110
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 2.0,
|
||||||
|
"step": 1114,
|
||||||
|
"total_flos": 1.4534558685629252e+19,
|
||||||
|
"train_loss": 0.27833450065266935,
|
||||||
|
"train_runtime": 6971.6374,
|
||||||
|
"train_samples_per_second": 20.435,
|
||||||
|
"train_steps_per_second": 0.16
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logging_steps": 10,
|
||||||
|
"max_steps": 1114,
|
||||||
|
"num_input_tokens_seen": 0,
|
||||||
|
"num_train_epochs": 2,
|
||||||
|
"save_steps": 500,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": true
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 1.4534558685629252e+19,
|
||||||
|
"train_batch_size": 4,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
||||||
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:faab7ad1aa08d8dcee85b43908a1e86bb716f50fae360a7995d7e5647cad68d4
|
||||||
|
size 6776
|
||||||
BIN
training_loss.png
Normal file
BIN
training_loss.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
Reference in New Issue
Block a user