初始化项目,由ModelHub XC社区提供模型
Model: W-61/mistral-7b-base-sft-hh-harmless-4xh200-batch-64 Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
67
README.md
Normal file
67
README.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
base_model: mistralai/Mistral-7B-v0.3
|
||||||
|
tags:
|
||||||
|
- alignment-handbook
|
||||||
|
- generated_from_trainer
|
||||||
|
datasets:
|
||||||
|
- Anthropic/hh-rlhf
|
||||||
|
model-index:
|
||||||
|
- name: mistral-7b-base-sft-hh-harmless-4xh200-batch-64-20260418-015332
|
||||||
|
results: []
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
||||||
|
should probably proofread and complete it, then remove this comment. -->
|
||||||
|
|
||||||
|
# mistral-7b-base-sft-hh-harmless-4xh200-batch-64-20260418-015332
|
||||||
|
|
||||||
|
This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the Anthropic/hh-rlhf dataset.
|
||||||
|
It achieves the following results on the evaluation set (at the last logged in-training evaluation, step 200):
|
||||||
|
- Loss: 1.1678
|
||||||
|
|
||||||
|
## Model description
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Intended uses & limitations
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training and evaluation data
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training procedure
|
||||||
|
|
||||||
|
### Training hyperparameters
|
||||||
|
|
||||||
|
The following hyperparameters were used during training:
|
||||||
|
- learning_rate: 2e-05
|
||||||
|
- train_batch_size: 8
|
||||||
|
- eval_batch_size: 8
|
||||||
|
- seed: 42
|
||||||
|
- distributed_type: multi-GPU
|
||||||
|
- num_devices: 4
|
||||||
|
- gradient_accumulation_steps: 2
|
||||||
|
- total_train_batch_size: 64
|
||||||
|
- total_eval_batch_size: 32
|
||||||
|
- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
|
||||||
|
- lr_scheduler_type: cosine
|
||||||
|
- lr_scheduler_warmup_ratio: 0.1
|
||||||
|
- num_epochs: 1
|
||||||
|
|
||||||
|
### Training results
|
||||||
|
|
||||||
|
| Training Loss | Epoch | Step | Validation Loss |
|
||||||
|
|:-------------:|:------:|:----:|:---------------:|
|
||||||
|
| 1.3645 | 0.3992 | 100 | 1.3725 |
|
||||||
|
| 1.1459 | 0.7984 | 200 | 1.1678 |
|
||||||
|
|
||||||
|
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- Transformers 4.51.0
|
||||||
|
- PyTorch 2.3.1+cu121
|
||||||
|
- Datasets 2.21.0
|
||||||
|
- Tokenizers 0.21.4
|
||||||
14
all_results.json
Normal file
14
all_results.json
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"eval_loss": 1.1445051431655884,
|
||||||
|
"eval_runtime": 4.6195,
|
||||||
|
"eval_samples": 2303,
|
||||||
|
"eval_samples_per_second": 195.476,
|
||||||
|
"eval_steps_per_second": 6.278,
|
||||||
|
"total_flos": 8.741444925364634e+16,
|
||||||
|
"train_loss": 1.2971151485443115,
|
||||||
|
"train_runtime": 891.5874,
|
||||||
|
"train_samples": 42336,
|
||||||
|
"train_samples_per_second": 17.966,
|
||||||
|
"train_steps_per_second": 0.28
|
||||||
|
}
|
||||||
26
config.json
Normal file
26
config.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"MistralForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 14336,
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"model_type": "mistral",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_theta": 1000000.0,
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "float32",
|
||||||
|
"transformers_version": "4.51.0",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 32768
|
||||||
|
}
|
||||||
8
eval_results.json
Normal file
8
eval_results.json
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"eval_loss": 1.1445051431655884,
|
||||||
|
"eval_runtime": 4.6195,
|
||||||
|
"eval_samples": 2303,
|
||||||
|
"eval_samples_per_second": 195.476,
|
||||||
|
"eval_steps_per_second": 6.278
|
||||||
|
}
|
||||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"transformers_version": "4.51.0"
|
||||||
|
}
|
||||||
3
model-00001-of-00006.safetensors
Normal file
3
model-00001-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:937f97c3d39b582912ed0dbe6a71067f3134affc5de7490ac46e3305cb43bc72
|
||||||
|
size 4999779856
|
||||||
3
model-00002-of-00006.safetensors
Normal file
3
model-00002-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:2d6c47686f13d7b3a93855b0339e6320fe7c294ca36cff1d72d145a6511c51b2
|
||||||
|
size 4899116440
|
||||||
3
model-00003-of-00006.safetensors
Normal file
3
model-00003-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:0e202f993edb76ad5fb9333c934b2e855bc215bd4bd24119152f62d6c7224009
|
||||||
|
size 4999813120
|
||||||
3
model-00004-of-00006.safetensors
Normal file
3
model-00004-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d7ee4c1680e7c4b867e6311f88bfb3ea29323c6a6100ab3e91b7275727656f19
|
||||||
|
size 4999813128
|
||||||
3
model-00005-of-00006.safetensors
Normal file
3
model-00005-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:4a1b3da8938e61aff4366d683554b12518c3f2a34e97b34118470b44907a7cb5
|
||||||
|
size 4832007496
|
||||||
3
model-00006-of-00006.safetensors
Normal file
3
model-00006-of-00006.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:dcf5816047009123f1babf9129872302564f5b56f7af376320a2282d4a276ce1
|
||||||
|
size 4261597808
|
||||||
298
model.safetensors.index.json
Normal file
298
model.safetensors.index.json
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_size": 28992094208
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.10.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.10.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.16.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.16.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
|
||||||
|
"model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.20.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.22.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
|
||||||
|
"model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.input_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.input_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.input_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.input_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
|
||||||
|
"model.layers.28.input_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.30.input_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.input_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
|
||||||
|
"model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
|
||||||
|
"model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
|
||||||
|
"model.norm.weight": "model-00006-of-00006.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
24
special_tokens_map.json
Normal file
24
special_tokens_map.json
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": "</s>",
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
275733
tokenizer.json
Normal file
275733
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
3
tokenizer.model
Normal file
3
tokenizer.model
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
|
||||||
|
size 587404
|
||||||
6188
tokenizer_config.json
Normal file
6188
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
9
train_results.json
Normal file
9
train_results.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"total_flos": 8.741444925364634e+16,
|
||||||
|
"train_loss": 1.2971151485443115,
|
||||||
|
"train_runtime": 891.5874,
|
||||||
|
"train_samples": 42336,
|
||||||
|
"train_samples_per_second": 17.966,
|
||||||
|
"train_steps_per_second": 0.28
|
||||||
|
}
|
||||||
416
trainer_state.json
Normal file
416
trainer_state.json
Normal file
@@ -0,0 +1,416 @@
|
|||||||
|
{
|
||||||
|
"best_global_step": null,
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"eval_steps": 100,
|
||||||
|
"global_step": 250,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [
|
||||||
|
{
|
||||||
|
"epoch": 0.003992015968063872,
|
||||||
|
"grad_norm": 71.5754165649414,
|
||||||
|
"learning_rate": 0.0,
|
||||||
|
"loss": 2.0095,
|
||||||
|
"step": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.01996007984031936,
|
||||||
|
"grad_norm": 25.808263778686523,
|
||||||
|
"learning_rate": 3.2000000000000003e-06,
|
||||||
|
"loss": 1.8226,
|
||||||
|
"step": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.03992015968063872,
|
||||||
|
"grad_norm": 10.55435562133789,
|
||||||
|
"learning_rate": 7.2000000000000005e-06,
|
||||||
|
"loss": 1.5338,
|
||||||
|
"step": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.059880239520958084,
|
||||||
|
"grad_norm": 13.148735046386719,
|
||||||
|
"learning_rate": 1.1200000000000001e-05,
|
||||||
|
"loss": 1.4367,
|
||||||
|
"step": 15
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.07984031936127745,
|
||||||
|
"grad_norm": 6.636435031890869,
|
||||||
|
"learning_rate": 1.5200000000000002e-05,
|
||||||
|
"loss": 1.444,
|
||||||
|
"step": 20
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.0998003992015968,
|
||||||
|
"grad_norm": 4.353740215301514,
|
||||||
|
"learning_rate": 1.9200000000000003e-05,
|
||||||
|
"loss": 1.4407,
|
||||||
|
"step": 25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.11976047904191617,
|
||||||
|
"grad_norm": 3.963563919067383,
|
||||||
|
"learning_rate": 1.9984407641819812e-05,
|
||||||
|
"loss": 1.4644,
|
||||||
|
"step": 30
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.13972055888223553,
|
||||||
|
"grad_norm": 4.042232036590576,
|
||||||
|
"learning_rate": 1.9921147013144782e-05,
|
||||||
|
"loss": 1.4582,
|
||||||
|
"step": 35
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.1596806387225549,
|
||||||
|
"grad_norm": 3.6070656776428223,
|
||||||
|
"learning_rate": 1.9809551553491918e-05,
|
||||||
|
"loss": 1.461,
|
||||||
|
"step": 40
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.17964071856287425,
|
||||||
|
"grad_norm": 3.843057632446289,
|
||||||
|
"learning_rate": 1.9650164944723116e-05,
|
||||||
|
"loss": 1.4496,
|
||||||
|
"step": 45
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.1996007984031936,
|
||||||
|
"grad_norm": 3.784003734588623,
|
||||||
|
"learning_rate": 1.944376370237481e-05,
|
||||||
|
"loss": 1.4632,
|
||||||
|
"step": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.21956087824351297,
|
||||||
|
"grad_norm": 3.471970319747925,
|
||||||
|
"learning_rate": 1.9191353392552346e-05,
|
||||||
|
"loss": 1.4363,
|
||||||
|
"step": 55
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.23952095808383234,
|
||||||
|
"grad_norm": 3.609161615371704,
|
||||||
|
"learning_rate": 1.889416373291298e-05,
|
||||||
|
"loss": 1.4209,
|
||||||
|
"step": 60
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.25948103792415167,
|
||||||
|
"grad_norm": 3.706693649291992,
|
||||||
|
"learning_rate": 1.855364260160507e-05,
|
||||||
|
"loss": 1.3991,
|
||||||
|
"step": 65
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.27944111776447106,
|
||||||
|
"grad_norm": 3.828991174697876,
|
||||||
|
"learning_rate": 1.8171448983351284e-05,
|
||||||
|
"loss": 1.4168,
|
||||||
|
"step": 70
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.2994011976047904,
|
||||||
|
"grad_norm": 3.53777813911438,
|
||||||
|
"learning_rate": 1.7749444887041797e-05,
|
||||||
|
"loss": 1.4197,
|
||||||
|
"step": 75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3193612774451098,
|
||||||
|
"grad_norm": 3.46360182762146,
|
||||||
|
"learning_rate": 1.7289686274214116e-05,
|
||||||
|
"loss": 1.4041,
|
||||||
|
"step": 80
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3393213572854291,
|
||||||
|
"grad_norm": 3.3420891761779785,
|
||||||
|
"learning_rate": 1.6794413042615168e-05,
|
||||||
|
"loss": 1.361,
|
||||||
|
"step": 85
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3592814371257485,
|
||||||
|
"grad_norm": 3.3036203384399414,
|
||||||
|
"learning_rate": 1.6266038113644605e-05,
|
||||||
|
"loss": 1.3671,
|
||||||
|
"step": 90
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.37924151696606784,
|
||||||
|
"grad_norm": 3.4878897666931152,
|
||||||
|
"learning_rate": 1.570713567684432e-05,
|
||||||
|
"loss": 1.346,
|
||||||
|
"step": 95
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3992015968063872,
|
||||||
|
"grad_norm": 4.090396404266357,
|
||||||
|
"learning_rate": 1.5120428648705716e-05,
|
||||||
|
"loss": 1.3645,
|
||||||
|
"step": 100
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.3992015968063872,
|
||||||
|
"eval_loss": 1.3725436925888062,
|
||||||
|
"eval_runtime": 4.6422,
|
||||||
|
"eval_samples_per_second": 194.519,
|
||||||
|
"eval_steps_per_second": 6.247,
|
||||||
|
"step": 100
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.41916167664670656,
|
||||||
|
"grad_norm": 3.2958004474639893,
|
||||||
|
"learning_rate": 1.4508775406894308e-05,
|
||||||
|
"loss": 1.3203,
|
||||||
|
"step": 105
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.43912175648702595,
|
||||||
|
"grad_norm": 3.205641746520996,
|
||||||
|
"learning_rate": 1.3875155864521031e-05,
|
||||||
|
"loss": 1.3251,
|
||||||
|
"step": 110
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.4590818363273453,
|
||||||
|
"grad_norm": 3.419351100921631,
|
||||||
|
"learning_rate": 1.3222656952305113e-05,
|
||||||
|
"loss": 1.3093,
|
||||||
|
"step": 115
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.47904191616766467,
|
||||||
|
"grad_norm": 3.5063862800598145,
|
||||||
|
"learning_rate": 1.2554457579357906e-05,
|
||||||
|
"loss": 1.297,
|
||||||
|
"step": 120
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.499001996007984,
|
||||||
|
"grad_norm": 3.2938807010650635,
|
||||||
|
"learning_rate": 1.187381314585725e-05,
|
||||||
|
"loss": 1.2889,
|
||||||
|
"step": 125
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5189620758483033,
|
||||||
|
"grad_norm": 3.2896780967712402,
|
||||||
|
"learning_rate": 1.1184039683065014e-05,
|
||||||
|
"loss": 1.2707,
|
||||||
|
"step": 130
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5389221556886228,
|
||||||
|
"grad_norm": 3.1759278774261475,
|
||||||
|
"learning_rate": 1.0488497697956134e-05,
|
||||||
|
"loss": 1.2518,
|
||||||
|
"step": 135
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5588822355289421,
|
||||||
|
"grad_norm": 3.616849422454834,
|
||||||
|
"learning_rate": 9.790575801166432e-06,
|
||||||
|
"loss": 1.2737,
|
||||||
|
"step": 140
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5788423153692615,
|
||||||
|
"grad_norm": 3.459834098815918,
|
||||||
|
"learning_rate": 9.093674198022201e-06,
|
||||||
|
"loss": 1.2496,
|
||||||
|
"step": 145
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.5988023952095808,
|
||||||
|
"grad_norm": 3.072103261947632,
|
||||||
|
"learning_rate": 8.401188123081653e-06,
|
||||||
|
"loss": 1.2129,
|
||||||
|
"step": 150
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6187624750499002,
|
||||||
|
"grad_norm": 3.2528676986694336,
|
||||||
|
"learning_rate": 7.716491298893443e-06,
|
||||||
|
"loss": 1.2096,
|
||||||
|
"step": 155
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6387225548902196,
|
||||||
|
"grad_norm": 3.041900157928467,
|
||||||
|
"learning_rate": 7.042919499559538e-06,
|
||||||
|
"loss": 1.2171,
|
||||||
|
"step": 160
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6586826347305389,
|
||||||
|
"grad_norm": 3.830709457397461,
|
||||||
|
"learning_rate": 6.383754299179079e-06,
|
||||||
|
"loss": 1.2038,
|
||||||
|
"step": 165
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6786427145708582,
|
||||||
|
"grad_norm": 3.1818060874938965,
|
||||||
|
"learning_rate": 5.742207084349274e-06,
|
||||||
|
"loss": 1.1999,
|
||||||
|
"step": 170
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.6986027944111777,
|
||||||
|
"grad_norm": 3.237358331680298,
|
||||||
|
"learning_rate": 5.121403408612672e-06,
|
||||||
|
"loss": 1.1821,
|
||||||
|
"step": 175
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.718562874251497,
|
||||||
|
"grad_norm": 3.207139015197754,
|
||||||
|
"learning_rate": 4.524367765074499e-06,
|
||||||
|
"loss": 1.1617,
|
||||||
|
"step": 180
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7385229540918163,
|
||||||
|
"grad_norm": 3.0992743968963623,
|
||||||
|
"learning_rate": 3.954008851376252e-06,
|
||||||
|
"loss": 1.1629,
|
||||||
|
"step": 185
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7584830339321357,
|
||||||
|
"grad_norm": 3.1126255989074707,
|
||||||
|
"learning_rate": 3.4131053988131947e-06,
|
||||||
|
"loss": 1.1688,
|
||||||
|
"step": 190
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7784431137724551,
|
||||||
|
"grad_norm": 3.3172667026519775,
|
||||||
|
"learning_rate": 2.9042926346347932e-06,
|
||||||
|
"loss": 1.1507,
|
||||||
|
"step": 195
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7984031936127745,
|
||||||
|
"grad_norm": 3.125807762145996,
|
||||||
|
"learning_rate": 2.4300494434824373e-06,
|
||||||
|
"loss": 1.1459,
|
||||||
|
"step": 200
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.7984031936127745,
|
||||||
|
"eval_loss": 1.1677805185317993,
|
||||||
|
"eval_runtime": 4.6292,
|
||||||
|
"eval_samples_per_second": 195.067,
|
||||||
|
"eval_steps_per_second": 6.265,
|
||||||
|
"step": 200
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8183632734530938,
|
||||||
|
"grad_norm": 3.1806719303131104,
|
||||||
|
"learning_rate": 1.9926862905126663e-06,
|
||||||
|
"loss": 1.1508,
|
||||||
|
"step": 205
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8383233532934131,
|
||||||
|
"grad_norm": 3.2433359622955322,
|
||||||
|
"learning_rate": 1.5943339650431578e-06,
|
||||||
|
"loss": 1.1156,
|
||||||
|
"step": 210
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8582834331337326,
|
||||||
|
"grad_norm": 3.1037845611572266,
|
||||||
|
"learning_rate": 1.2369331995613664e-06,
|
||||||
|
"loss": 1.1278,
|
||||||
|
"step": 215
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8782435129740519,
|
||||||
|
"grad_norm": 3.121793270111084,
|
||||||
|
"learning_rate": 9.222252146709143e-07,
|
||||||
|
"loss": 1.1291,
|
||||||
|
"step": 220
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.8982035928143712,
|
||||||
|
"grad_norm": 3.311478614807129,
|
||||||
|
"learning_rate": 6.517432360398556e-07,
|
||||||
|
"loss": 1.1606,
|
||||||
|
"step": 225
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9181636726546906,
|
||||||
|
"grad_norm": 3.1572906970977783,
|
||||||
|
"learning_rate": 4.268050246793276e-07,
|
||||||
|
"loss": 1.1376,
|
||||||
|
"step": 230
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.93812375249501,
|
||||||
|
"grad_norm": 3.125819683074951,
|
||||||
|
"learning_rate": 2.4850645694436736e-07,
|
||||||
|
"loss": 1.1042,
|
||||||
|
"step": 235
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9580838323353293,
|
||||||
|
"grad_norm": 3.240495443344116,
|
||||||
|
"learning_rate": 1.1771618553447217e-07,
|
||||||
|
"loss": 1.1349,
|
||||||
|
"step": 240
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.9780439121756487,
|
||||||
|
"grad_norm": 3.0710411071777344,
|
||||||
|
"learning_rate": 3.50714075049563e-08,
|
||||||
|
"loss": 1.1139,
|
||||||
|
"step": 245
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"grad_norm": 3.2199409008026123,
|
||||||
|
"learning_rate": 9.74759906957612e-10,
|
||||||
|
"loss": 1.1324,
|
||||||
|
"step": 250
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.998003992015968,
|
||||||
|
"step": 250,
|
||||||
|
"total_flos": 8.741444925364634e+16,
|
||||||
|
"train_loss": 1.2971151485443115,
|
||||||
|
"train_runtime": 891.5874,
|
||||||
|
"train_samples_per_second": 17.966,
|
||||||
|
"train_steps_per_second": 0.28
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logging_steps": 5,
|
||||||
|
"max_steps": 250,
|
||||||
|
"num_input_tokens_seen": 0,
|
||||||
|
"num_train_epochs": 1,
|
||||||
|
"save_steps": 200,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": true
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 8.741444925364634e+16,
|
||||||
|
"train_batch_size": 8,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user