初始化项目,由ModelHub XC社区提供模型
Model: Joshua-Sun-CompSci/GPT-2_academic_style_tune Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
49
README.md
Normal file
49
README.md
Normal file
@@ -0,0 +1,49 @@
|
||||
---
|
||||
license: mit
|
||||
language:
|
||||
- en
|
||||
base_model:
|
||||
- gpt2
|
||||
pipeline_tag: text-generation
|
||||
tags:
|
||||
- academic-writing
|
||||
- gpt2
|
||||
- fine-tuning
|
||||
- arxiv
|
||||
- llm
|
||||
- uc-davis
|
||||
---
|
||||
|
||||
# GPT-2 Fine-Tuned for Academic Style Writing
|
||||
|
||||
This model is a fine-tuned version of GPT-2 (124M), trained on 500,000 academic abstracts from the arXiv dataset. It was developed as part of the ECS 271 final project at UC Davis.
|
||||
|
||||
## Intended Use
|
||||
|
||||
This model is designed to generate text with an academic tone and structure, for use in educational and research-oriented applications such as writing assistants or tutors.
|
||||
|
||||
## Training Details
|
||||
|
||||
- Base model: `gpt2`
|
||||
- Method: Full-parameter fine-tuning
|
||||
- Dataset: 500K arXiv abstracts
|
||||
- Epochs: ~1.2 (checkpoint saved at step 12,000 of a planned 2-epoch, 19,856-step run)
|
||||
- GPU: RTX 5070 Ti (~2 hours)
|
||||
|
||||
## Limitations
|
||||
|
||||
- May hallucinate facts
|
||||
- Not citation-aware
|
||||
- English-only
|
||||
- No factual grounding
|
||||
|
||||
## Example
|
||||
|
||||
**Prompt:**
|
||||
"Transformer models have impacted the field of education by..."
|
||||
|
||||
**Output:**
|
||||
"Transformer models have impacted the field of education by enabling new forms of intelligent tutoring systems and automatic curriculum generation. Their ability to understand context has allowed for more personalized and scalable academic tools."
|
||||
|
||||
## More Info
|
||||
Main project: [GitHub repository](https://github.com/Joshua-Sun-CompSci/academic-style-llms)
|
||||
38
config.json
Normal file
38
config.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"activation_function": "gelu_new",
|
||||
"architectures": [
|
||||
"GPT2LMHeadModel"
|
||||
],
|
||||
"attn_pdrop": 0.1,
|
||||
"bos_token_id": 50256,
|
||||
"embd_pdrop": 0.1,
|
||||
"eos_token_id": 50256,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 1e-05,
|
||||
"model_type": "gpt2",
|
||||
"n_ctx": 1024,
|
||||
"n_embd": 768,
|
||||
"n_head": 12,
|
||||
"n_inner": null,
|
||||
"n_layer": 12,
|
||||
"n_positions": 1024,
|
||||
"reorder_and_upcast_attn": false,
|
||||
"resid_pdrop": 0.1,
|
||||
"scale_attn_by_inverse_layer_idx": false,
|
||||
"scale_attn_weights": true,
|
||||
"summary_activation": null,
|
||||
"summary_first_dropout": 0.1,
|
||||
"summary_proj_to_labels": true,
|
||||
"summary_type": "cls_index",
|
||||
"summary_use_proj": true,
|
||||
"task_specific_params": {
|
||||
"text-generation": {
|
||||
"do_sample": true,
|
||||
"max_length": 50
|
||||
}
|
||||
},
|
||||
"torch_dtype": "float32",
|
||||
"transformers_version": "4.52.1",
|
||||
"use_cache": true,
|
||||
"vocab_size": 50257
|
||||
}
|
||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 50256,
|
||||
"eos_token_id": 50256,
|
||||
"transformers_version": "4.52.1"
|
||||
}
|
||||
50001
merges.txt
Normal file
50001
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:07ff07d84ee8c2e2f15d11317364666a957604ed3ad39dc08ade92d4ce772898
|
||||
size 497774208
|
||||
3
optimizer.pt
Normal file
3
optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:23bf795bebb4f351c5bd4002c8007ee0937df58155bedc8d3547e26c49fbb182
|
||||
size 995642763
|
||||
3
rng_state.pth
Normal file
3
rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:bc44c7aa517691c7db52076f98339a01fc057206d8d256e942c358ae19833245
|
||||
size 14645
|
||||
3
scaler.pt
Normal file
3
scaler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f6be6948435bad5d6f12984605648954670ddc894fd0cb1e7583c339811c85e4
|
||||
size 1383
|
||||
3
scheduler.pt
Normal file
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:697e825566a5b120310b7236ac2aabc2ae52ecab7873727faf6b175b593a89fb
|
||||
size 1465
|
||||
24
special_tokens_map.json
Normal file
24
special_tokens_map.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<|endoftext|>",
|
||||
"unk_token": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
23
tokenizer_config.json
Normal file
23
tokenizer_config.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"add_bos_token": false,
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"50256": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<|endoftext|>",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|endoftext|>",
|
||||
"errors": "replace",
|
||||
"extra_special_tokens": {},
|
||||
"model_max_length": 1024,
|
||||
"pad_token": "<|endoftext|>",
|
||||
"tokenizer_class": "GPT2Tokenizer",
|
||||
"unk_token": "<|endoftext|>"
|
||||
}
|
||||
874
trainer_state.json
Normal file
874
trainer_state.json
Normal file
@@ -0,0 +1,874 @@
|
||||
{
|
||||
"best_global_step": null,
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 1.20870265914585,
|
||||
"eval_steps": 500,
|
||||
"global_step": 12000,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.010072522159548751,
|
||||
"grad_norm": 2.187725782394409,
|
||||
"learning_rate": 4.975322320709106e-05,
|
||||
"loss": 3.892,
|
||||
"step": 100
|
||||
},
|
||||
{
|
||||
"epoch": 0.020145044319097503,
|
||||
"grad_norm": 1.7743529081344604,
|
||||
"learning_rate": 4.950141015310234e-05,
|
||||
"loss": 3.7714,
|
||||
"step": 200
|
||||
},
|
||||
{
|
||||
"epoch": 0.030217566478646252,
|
||||
"grad_norm": 1.555106520652771,
|
||||
"learning_rate": 4.924959709911362e-05,
|
||||
"loss": 3.7284,
|
||||
"step": 300
|
||||
},
|
||||
{
|
||||
"epoch": 0.040290088638195005,
|
||||
"grad_norm": 1.5166494846343994,
|
||||
"learning_rate": 4.89977840451249e-05,
|
||||
"loss": 3.691,
|
||||
"step": 400
|
||||
},
|
||||
{
|
||||
"epoch": 0.050362610797743755,
|
||||
"grad_norm": 1.30672025680542,
|
||||
"learning_rate": 4.874597099113618e-05,
|
||||
"loss": 3.6623,
|
||||
"step": 500
|
||||
},
|
||||
{
|
||||
"epoch": 0.060435132957292505,
|
||||
"grad_norm": 1.320106863975525,
|
||||
"learning_rate": 4.8494157937147464e-05,
|
||||
"loss": 3.6443,
|
||||
"step": 600
|
||||
},
|
||||
{
|
||||
"epoch": 0.07050765511684126,
|
||||
"grad_norm": 1.256370186805725,
|
||||
"learning_rate": 4.8242344883158744e-05,
|
||||
"loss": 3.6066,
|
||||
"step": 700
|
||||
},
|
||||
{
|
||||
"epoch": 0.08058017727639001,
|
||||
"grad_norm": 1.2294926643371582,
|
||||
"learning_rate": 4.7990531829170024e-05,
|
||||
"loss": 3.5984,
|
||||
"step": 800
|
||||
},
|
||||
{
|
||||
"epoch": 0.09065269943593876,
|
||||
"grad_norm": 1.1439231634140015,
|
||||
"learning_rate": 4.7738718775181304e-05,
|
||||
"loss": 3.5885,
|
||||
"step": 900
|
||||
},
|
||||
{
|
||||
"epoch": 0.10072522159548751,
|
||||
"grad_norm": 1.132494330406189,
|
||||
"learning_rate": 4.7486905721192584e-05,
|
||||
"loss": 3.5649,
|
||||
"step": 1000
|
||||
},
|
||||
{
|
||||
"epoch": 0.11079774375503626,
|
||||
"grad_norm": 1.2308531999588013,
|
||||
"learning_rate": 4.723509266720387e-05,
|
||||
"loss": 3.5437,
|
||||
"step": 1100
|
||||
},
|
||||
{
|
||||
"epoch": 0.12087026591458501,
|
||||
"grad_norm": 1.0526450872421265,
|
||||
"learning_rate": 4.698327961321515e-05,
|
||||
"loss": 3.5426,
|
||||
"step": 1200
|
||||
},
|
||||
{
|
||||
"epoch": 0.13094278807413376,
|
||||
"grad_norm": 1.063685655593872,
|
||||
"learning_rate": 4.673146655922643e-05,
|
||||
"loss": 3.5359,
|
||||
"step": 1300
|
||||
},
|
||||
{
|
||||
"epoch": 0.14101531023368252,
|
||||
"grad_norm": 1.0278949737548828,
|
||||
"learning_rate": 4.647965350523771e-05,
|
||||
"loss": 3.5286,
|
||||
"step": 1400
|
||||
},
|
||||
{
|
||||
"epoch": 0.15108783239323126,
|
||||
"grad_norm": 1.0139474868774414,
|
||||
"learning_rate": 4.622784045124899e-05,
|
||||
"loss": 3.5071,
|
||||
"step": 1500
|
||||
},
|
||||
{
|
||||
"epoch": 0.16116035455278002,
|
||||
"grad_norm": 1.0838497877120972,
|
||||
"learning_rate": 4.597602739726028e-05,
|
||||
"loss": 3.5132,
|
||||
"step": 1600
|
||||
},
|
||||
{
|
||||
"epoch": 0.17123287671232876,
|
||||
"grad_norm": 0.9688405394554138,
|
||||
"learning_rate": 4.572421434327156e-05,
|
||||
"loss": 3.496,
|
||||
"step": 1700
|
||||
},
|
||||
{
|
||||
"epoch": 0.18130539887187752,
|
||||
"grad_norm": 0.9470923542976379,
|
||||
"learning_rate": 4.547240128928284e-05,
|
||||
"loss": 3.4971,
|
||||
"step": 1800
|
||||
},
|
||||
{
|
||||
"epoch": 0.19137792103142626,
|
||||
"grad_norm": 0.9688748121261597,
|
||||
"learning_rate": 4.522058823529412e-05,
|
||||
"loss": 3.4827,
|
||||
"step": 1900
|
||||
},
|
||||
{
|
||||
"epoch": 0.20145044319097502,
|
||||
"grad_norm": 0.9359450936317444,
|
||||
"learning_rate": 4.49687751813054e-05,
|
||||
"loss": 3.4771,
|
||||
"step": 2000
|
||||
},
|
||||
{
|
||||
"epoch": 0.21152296535052378,
|
||||
"grad_norm": 0.9809269309043884,
|
||||
"learning_rate": 4.4716962127316684e-05,
|
||||
"loss": 3.4685,
|
||||
"step": 2100
|
||||
},
|
||||
{
|
||||
"epoch": 0.22159548751007252,
|
||||
"grad_norm": 0.9238491058349609,
|
||||
"learning_rate": 4.4465149073327964e-05,
|
||||
"loss": 3.4603,
|
||||
"step": 2200
|
||||
},
|
||||
{
|
||||
"epoch": 0.23166800966962128,
|
||||
"grad_norm": 0.9273353815078735,
|
||||
"learning_rate": 4.4213336019339244e-05,
|
||||
"loss": 3.4639,
|
||||
"step": 2300
|
||||
},
|
||||
{
|
||||
"epoch": 0.24174053182917002,
|
||||
"grad_norm": 0.9298893809318542,
|
||||
"learning_rate": 4.3961522965350524e-05,
|
||||
"loss": 3.4526,
|
||||
"step": 2400
|
||||
},
|
||||
{
|
||||
"epoch": 0.2518130539887188,
|
||||
"grad_norm": 0.9529768824577332,
|
||||
"learning_rate": 4.370970991136181e-05,
|
||||
"loss": 3.4473,
|
||||
"step": 2500
|
||||
},
|
||||
{
|
||||
"epoch": 0.2618855761482675,
|
||||
"grad_norm": 0.9361433982849121,
|
||||
"learning_rate": 4.345789685737309e-05,
|
||||
"loss": 3.4397,
|
||||
"step": 2600
|
||||
},
|
||||
{
|
||||
"epoch": 0.27195809830781625,
|
||||
"grad_norm": 0.913864254951477,
|
||||
"learning_rate": 4.320608380338437e-05,
|
||||
"loss": 3.4421,
|
||||
"step": 2700
|
||||
},
|
||||
{
|
||||
"epoch": 0.28203062046736505,
|
||||
"grad_norm": 0.9004995226860046,
|
||||
"learning_rate": 4.295427074939565e-05,
|
||||
"loss": 3.4216,
|
||||
"step": 2800
|
||||
},
|
||||
{
|
||||
"epoch": 0.2921031426269138,
|
||||
"grad_norm": 0.9446254968643188,
|
||||
"learning_rate": 4.270245769540693e-05,
|
||||
"loss": 3.4334,
|
||||
"step": 2900
|
||||
},
|
||||
{
|
||||
"epoch": 0.3021756647864625,
|
||||
"grad_norm": 0.902070939540863,
|
||||
"learning_rate": 4.245064464141822e-05,
|
||||
"loss": 3.4257,
|
||||
"step": 3000
|
||||
},
|
||||
{
|
||||
"epoch": 0.3122481869460113,
|
||||
"grad_norm": 0.8563255071640015,
|
||||
"learning_rate": 4.21988315874295e-05,
|
||||
"loss": 3.4336,
|
||||
"step": 3100
|
||||
},
|
||||
{
|
||||
"epoch": 0.32232070910556004,
|
||||
"grad_norm": 0.8490210175514221,
|
||||
"learning_rate": 4.194701853344078e-05,
|
||||
"loss": 3.4212,
|
||||
"step": 3200
|
||||
},
|
||||
{
|
||||
"epoch": 0.3323932312651088,
|
||||
"grad_norm": 0.8508225679397583,
|
||||
"learning_rate": 4.169520547945206e-05,
|
||||
"loss": 3.4186,
|
||||
"step": 3300
|
||||
},
|
||||
{
|
||||
"epoch": 0.3424657534246575,
|
||||
"grad_norm": 0.8672498464584351,
|
||||
"learning_rate": 4.144339242546334e-05,
|
||||
"loss": 3.4149,
|
||||
"step": 3400
|
||||
},
|
||||
{
|
||||
"epoch": 0.3525382755842063,
|
||||
"grad_norm": 0.9109395146369934,
|
||||
"learning_rate": 4.119157937147462e-05,
|
||||
"loss": 3.4027,
|
||||
"step": 3500
|
||||
},
|
||||
{
|
||||
"epoch": 0.36261079774375504,
|
||||
"grad_norm": 0.9174132347106934,
|
||||
"learning_rate": 4.09397663174859e-05,
|
||||
"loss": 3.4152,
|
||||
"step": 3600
|
||||
},
|
||||
{
|
||||
"epoch": 0.3726833199033038,
|
||||
"grad_norm": 0.8484577536582947,
|
||||
"learning_rate": 4.068795326349718e-05,
|
||||
"loss": 3.3936,
|
||||
"step": 3700
|
||||
},
|
||||
{
|
||||
"epoch": 0.3827558420628525,
|
||||
"grad_norm": 0.8233897089958191,
|
||||
"learning_rate": 4.043614020950846e-05,
|
||||
"loss": 3.3997,
|
||||
"step": 3800
|
||||
},
|
||||
{
|
||||
"epoch": 0.3928283642224013,
|
||||
"grad_norm": 0.8456436991691589,
|
||||
"learning_rate": 4.018432715551974e-05,
|
||||
"loss": 3.3947,
|
||||
"step": 3900
|
||||
},
|
||||
{
|
||||
"epoch": 0.40290088638195004,
|
||||
"grad_norm": 0.8862765431404114,
|
||||
"learning_rate": 3.993251410153103e-05,
|
||||
"loss": 3.3872,
|
||||
"step": 4000
|
||||
},
|
||||
{
|
||||
"epoch": 0.4129734085414988,
|
||||
"grad_norm": 0.8666160106658936,
|
||||
"learning_rate": 3.968070104754231e-05,
|
||||
"loss": 3.3917,
|
||||
"step": 4100
|
||||
},
|
||||
{
|
||||
"epoch": 0.42304593070104757,
|
||||
"grad_norm": 0.8640286922454834,
|
||||
"learning_rate": 3.942888799355359e-05,
|
||||
"loss": 3.3808,
|
||||
"step": 4200
|
||||
},
|
||||
{
|
||||
"epoch": 0.4331184528605963,
|
||||
"grad_norm": 0.874873161315918,
|
||||
"learning_rate": 3.917707493956487e-05,
|
||||
"loss": 3.376,
|
||||
"step": 4300
|
||||
},
|
||||
{
|
||||
"epoch": 0.44319097502014504,
|
||||
"grad_norm": 0.8452139496803284,
|
||||
"learning_rate": 3.892526188557615e-05,
|
||||
"loss": 3.3868,
|
||||
"step": 4400
|
||||
},
|
||||
{
|
||||
"epoch": 0.4532634971796938,
|
||||
"grad_norm": 0.9129024147987366,
|
||||
"learning_rate": 3.8673448831587436e-05,
|
||||
"loss": 3.377,
|
||||
"step": 4500
|
||||
},
|
||||
{
|
||||
"epoch": 0.46333601933924257,
|
||||
"grad_norm": 0.8627080321311951,
|
||||
"learning_rate": 3.8421635777598716e-05,
|
||||
"loss": 3.3755,
|
||||
"step": 4600
|
||||
},
|
||||
{
|
||||
"epoch": 0.4734085414987913,
|
||||
"grad_norm": 0.8462901711463928,
|
||||
"learning_rate": 3.8169822723609996e-05,
|
||||
"loss": 3.3718,
|
||||
"step": 4700
|
||||
},
|
||||
{
|
||||
"epoch": 0.48348106365834004,
|
||||
"grad_norm": 0.8301746249198914,
|
||||
"learning_rate": 3.7918009669621276e-05,
|
||||
"loss": 3.3731,
|
||||
"step": 4800
|
||||
},
|
||||
{
|
||||
"epoch": 0.4935535858178888,
|
||||
"grad_norm": 0.8276001214981079,
|
||||
"learning_rate": 3.7666196615632556e-05,
|
||||
"loss": 3.3711,
|
||||
"step": 4900
|
||||
},
|
||||
{
|
||||
"epoch": 0.5036261079774376,
|
||||
"grad_norm": 0.8247309327125549,
|
||||
"learning_rate": 3.741438356164384e-05,
|
||||
"loss": 3.3684,
|
||||
"step": 5000
|
||||
},
|
||||
{
|
||||
"epoch": 0.5136986301369864,
|
||||
"grad_norm": 0.8287461400032043,
|
||||
"learning_rate": 3.716257050765512e-05,
|
||||
"loss": 3.3692,
|
||||
"step": 5100
|
||||
},
|
||||
{
|
||||
"epoch": 0.523771152296535,
|
||||
"grad_norm": 0.8390397429466248,
|
||||
"learning_rate": 3.69107574536664e-05,
|
||||
"loss": 3.3606,
|
||||
"step": 5200
|
||||
},
|
||||
{
|
||||
"epoch": 0.5338436744560838,
|
||||
"grad_norm": 0.814963161945343,
|
||||
"learning_rate": 3.665894439967768e-05,
|
||||
"loss": 3.3622,
|
||||
"step": 5300
|
||||
},
|
||||
{
|
||||
"epoch": 0.5439161966156325,
|
||||
"grad_norm": 0.8671918511390686,
|
||||
"learning_rate": 3.640713134568896e-05,
|
||||
"loss": 3.3495,
|
||||
"step": 5400
|
||||
},
|
||||
{
|
||||
"epoch": 0.5539887187751813,
|
||||
"grad_norm": 0.8633044958114624,
|
||||
"learning_rate": 3.615531829170024e-05,
|
||||
"loss": 3.3521,
|
||||
"step": 5500
|
||||
},
|
||||
{
|
||||
"epoch": 0.5640612409347301,
|
||||
"grad_norm": 0.8254402279853821,
|
||||
"learning_rate": 3.590350523771153e-05,
|
||||
"loss": 3.3419,
|
||||
"step": 5600
|
||||
},
|
||||
{
|
||||
"epoch": 0.5741337630942788,
|
||||
"grad_norm": 0.8266107439994812,
|
||||
"learning_rate": 3.565169218372281e-05,
|
||||
"loss": 3.3529,
|
||||
"step": 5700
|
||||
},
|
||||
{
|
||||
"epoch": 0.5842062852538276,
|
||||
"grad_norm": 0.8199731111526489,
|
||||
"learning_rate": 3.539987912973409e-05,
|
||||
"loss": 3.3425,
|
||||
"step": 5800
|
||||
},
|
||||
{
|
||||
"epoch": 0.5942788074133764,
|
||||
"grad_norm": 0.8126055002212524,
|
||||
"learning_rate": 3.514806607574537e-05,
|
||||
"loss": 3.345,
|
||||
"step": 5900
|
||||
},
|
||||
{
|
||||
"epoch": 0.604351329572925,
|
||||
"grad_norm": 0.8237761855125427,
|
||||
"learning_rate": 3.489625302175665e-05,
|
||||
"loss": 3.3411,
|
||||
"step": 6000
|
||||
},
|
||||
{
|
||||
"epoch": 0.6144238517324738,
|
||||
"grad_norm": 0.8377647995948792,
|
||||
"learning_rate": 3.4644439967767936e-05,
|
||||
"loss": 3.3342,
|
||||
"step": 6100
|
||||
},
|
||||
{
|
||||
"epoch": 0.6244963738920226,
|
||||
"grad_norm": 0.8051643371582031,
|
||||
"learning_rate": 3.43951450443191e-05,
|
||||
"loss": 3.3316,
|
||||
"step": 6200
|
||||
},
|
||||
{
|
||||
"epoch": 0.6345688960515713,
|
||||
"grad_norm": 0.8488379716873169,
|
||||
"learning_rate": 3.414333199033038e-05,
|
||||
"loss": 3.3368,
|
||||
"step": 6300
|
||||
},
|
||||
{
|
||||
"epoch": 0.6446414182111201,
|
||||
"grad_norm": 0.8671479225158691,
|
||||
"learning_rate": 3.3891518936341657e-05,
|
||||
"loss": 3.3372,
|
||||
"step": 6400
|
||||
},
|
||||
{
|
||||
"epoch": 0.6547139403706688,
|
||||
"grad_norm": 0.8239823579788208,
|
||||
"learning_rate": 3.363970588235294e-05,
|
||||
"loss": 3.3266,
|
||||
"step": 6500
|
||||
},
|
||||
{
|
||||
"epoch": 0.6647864625302176,
|
||||
"grad_norm": 0.8625733852386475,
|
||||
"learning_rate": 3.338789282836422e-05,
|
||||
"loss": 3.3313,
|
||||
"step": 6600
|
||||
},
|
||||
{
|
||||
"epoch": 0.6748589846897664,
|
||||
"grad_norm": 0.8307435512542725,
|
||||
"learning_rate": 3.31360797743755e-05,
|
||||
"loss": 3.3334,
|
||||
"step": 6700
|
||||
},
|
||||
{
|
||||
"epoch": 0.684931506849315,
|
||||
"grad_norm": 0.8079032301902771,
|
||||
"learning_rate": 3.288426672038678e-05,
|
||||
"loss": 3.3326,
|
||||
"step": 6800
|
||||
},
|
||||
{
|
||||
"epoch": 0.6950040290088638,
|
||||
"grad_norm": 0.8174043893814087,
|
||||
"learning_rate": 3.263245366639806e-05,
|
||||
"loss": 3.3257,
|
||||
"step": 6900
|
||||
},
|
||||
{
|
||||
"epoch": 0.7050765511684126,
|
||||
"grad_norm": 0.8356669545173645,
|
||||
"learning_rate": 3.238064061240934e-05,
|
||||
"loss": 3.3254,
|
||||
"step": 7000
|
||||
},
|
||||
{
|
||||
"epoch": 0.7151490733279613,
|
||||
"grad_norm": 0.8560519814491272,
|
||||
"learning_rate": 3.212882755842063e-05,
|
||||
"loss": 3.3214,
|
||||
"step": 7100
|
||||
},
|
||||
{
|
||||
"epoch": 0.7252215954875101,
|
||||
"grad_norm": 0.7916297912597656,
|
||||
"learning_rate": 3.187701450443191e-05,
|
||||
"loss": 3.322,
|
||||
"step": 7200
|
||||
},
|
||||
{
|
||||
"epoch": 0.7352941176470589,
|
||||
"grad_norm": 0.8525456786155701,
|
||||
"learning_rate": 3.162520145044319e-05,
|
||||
"loss": 3.3126,
|
||||
"step": 7300
|
||||
},
|
||||
{
|
||||
"epoch": 0.7453666398066076,
|
||||
"grad_norm": 0.7955446243286133,
|
||||
"learning_rate": 3.137338839645447e-05,
|
||||
"loss": 3.3141,
|
||||
"step": 7400
|
||||
},
|
||||
{
|
||||
"epoch": 0.7554391619661563,
|
||||
"grad_norm": 0.85768061876297,
|
||||
"learning_rate": 3.1121575342465756e-05,
|
||||
"loss": 3.3084,
|
||||
"step": 7500
|
||||
},
|
||||
{
|
||||
"epoch": 0.765511684125705,
|
||||
"grad_norm": 0.8224223256111145,
|
||||
"learning_rate": 3.0869762288477036e-05,
|
||||
"loss": 3.3137,
|
||||
"step": 7600
|
||||
},
|
||||
{
|
||||
"epoch": 0.7755842062852538,
|
||||
"grad_norm": 0.8332231640815735,
|
||||
"learning_rate": 3.0617949234488316e-05,
|
||||
"loss": 3.3034,
|
||||
"step": 7700
|
||||
},
|
||||
{
|
||||
"epoch": 0.7856567284448026,
|
||||
"grad_norm": 0.8170804381370544,
|
||||
"learning_rate": 3.03661361804996e-05,
|
||||
"loss": 3.3089,
|
||||
"step": 7800
|
||||
},
|
||||
{
|
||||
"epoch": 0.7957292506043513,
|
||||
"grad_norm": 0.8121609091758728,
|
||||
"learning_rate": 3.011432312651088e-05,
|
||||
"loss": 3.3153,
|
||||
"step": 7900
|
||||
},
|
||||
{
|
||||
"epoch": 0.8058017727639001,
|
||||
"grad_norm": 0.8763326406478882,
|
||||
"learning_rate": 2.986251007252216e-05,
|
||||
"loss": 3.2977,
|
||||
"step": 8000
|
||||
},
|
||||
{
|
||||
"epoch": 0.8158742949234489,
|
||||
"grad_norm": 0.8313045501708984,
|
||||
"learning_rate": 2.961069701853344e-05,
|
||||
"loss": 3.3112,
|
||||
"step": 8100
|
||||
},
|
||||
{
|
||||
"epoch": 0.8259468170829976,
|
||||
"grad_norm": 0.8360570669174194,
|
||||
"learning_rate": 2.9358883964544726e-05,
|
||||
"loss": 3.2993,
|
||||
"step": 8200
|
||||
},
|
||||
{
|
||||
"epoch": 0.8360193392425463,
|
||||
"grad_norm": 0.8124191761016846,
|
||||
"learning_rate": 2.9107070910556006e-05,
|
||||
"loss": 3.3043,
|
||||
"step": 8300
|
||||
},
|
||||
{
|
||||
"epoch": 0.8460918614020951,
|
||||
"grad_norm": 0.8028224110603333,
|
||||
"learning_rate": 2.8855257856567286e-05,
|
||||
"loss": 3.2932,
|
||||
"step": 8400
|
||||
},
|
||||
{
|
||||
"epoch": 0.8561643835616438,
|
||||
"grad_norm": 0.8234061002731323,
|
||||
"learning_rate": 2.8605962933118454e-05,
|
||||
"loss": 3.2919,
|
||||
"step": 8500
|
||||
},
|
||||
{
|
||||
"epoch": 0.8662369057211926,
|
||||
"grad_norm": 0.8246094584465027,
|
||||
"learning_rate": 2.8354149879129737e-05,
|
||||
"loss": 3.3036,
|
||||
"step": 8600
|
||||
},
|
||||
{
|
||||
"epoch": 0.8763094278807413,
|
||||
"grad_norm": 0.7993488311767578,
|
||||
"learning_rate": 2.8102336825141017e-05,
|
||||
"loss": 3.301,
|
||||
"step": 8700
|
||||
},
|
||||
{
|
||||
"epoch": 0.8863819500402901,
|
||||
"grad_norm": 0.8250619173049927,
|
||||
"learning_rate": 2.7850523771152297e-05,
|
||||
"loss": 3.3014,
|
||||
"step": 8800
|
||||
},
|
||||
{
|
||||
"epoch": 0.8964544721998389,
|
||||
"grad_norm": 0.7909018993377686,
|
||||
"learning_rate": 2.7598710717163577e-05,
|
||||
"loss": 3.2997,
|
||||
"step": 8900
|
||||
},
|
||||
{
|
||||
"epoch": 0.9065269943593876,
|
||||
"grad_norm": 0.8329810500144958,
|
||||
"learning_rate": 2.734689766317486e-05,
|
||||
"loss": 3.2909,
|
||||
"step": 9000
|
||||
},
|
||||
{
|
||||
"epoch": 0.9165995165189363,
|
||||
"grad_norm": 0.7919082045555115,
|
||||
"learning_rate": 2.709508460918614e-05,
|
||||
"loss": 3.2974,
|
||||
"step": 9100
|
||||
},
|
||||
{
|
||||
"epoch": 0.9266720386784851,
|
||||
"grad_norm": 0.8208735585212708,
|
||||
"learning_rate": 2.6843271555197423e-05,
|
||||
"loss": 3.2885,
|
||||
"step": 9200
|
||||
},
|
||||
{
|
||||
"epoch": 0.9367445608380338,
|
||||
"grad_norm": 0.8451607823371887,
|
||||
"learning_rate": 2.6591458501208703e-05,
|
||||
"loss": 3.2895,
|
||||
"step": 9300
|
||||
},
|
||||
{
|
||||
"epoch": 0.9468170829975826,
|
||||
"grad_norm": 0.7859387993812561,
|
||||
"learning_rate": 2.6339645447219983e-05,
|
||||
"loss": 3.2808,
|
||||
"step": 9400
|
||||
},
|
||||
{
|
||||
"epoch": 0.9568896051571314,
|
||||
"grad_norm": 0.8202655911445618,
|
||||
"learning_rate": 2.6087832393231267e-05,
|
||||
"loss": 3.2822,
|
||||
"step": 9500
|
||||
},
|
||||
{
|
||||
"epoch": 0.9669621273166801,
|
||||
"grad_norm": 0.8055428266525269,
|
||||
"learning_rate": 2.5836019339242547e-05,
|
||||
"loss": 3.2767,
|
||||
"step": 9600
|
||||
},
|
||||
{
|
||||
"epoch": 0.9770346494762289,
|
||||
"grad_norm": 0.8379536867141724,
|
||||
"learning_rate": 2.558420628525383e-05,
|
||||
"loss": 3.2846,
|
||||
"step": 9700
|
||||
},
|
||||
{
|
||||
"epoch": 0.9871071716357775,
|
||||
"grad_norm": 0.8224022388458252,
|
||||
"learning_rate": 2.533239323126511e-05,
|
||||
"loss": 3.2822,
|
||||
"step": 9800
|
||||
},
|
||||
{
|
||||
"epoch": 0.9971796937953263,
|
||||
"grad_norm": 0.7945728898048401,
|
||||
"learning_rate": 2.508058017727639e-05,
|
||||
"loss": 3.2909,
|
||||
"step": 9900
|
||||
},
|
||||
{
|
||||
"epoch": 1.0072522159548751,
|
||||
"grad_norm": 0.8296171426773071,
|
||||
"learning_rate": 2.4828767123287673e-05,
|
||||
"loss": 3.2785,
|
||||
"step": 10000
|
||||
},
|
||||
{
|
||||
"epoch": 1.017324738114424,
|
||||
"grad_norm": 0.8091270327568054,
|
||||
"learning_rate": 2.4576954069298953e-05,
|
||||
"loss": 3.2586,
|
||||
"step": 10100
|
||||
},
|
||||
{
|
||||
"epoch": 1.0273972602739727,
|
||||
"grad_norm": 0.8024172782897949,
|
||||
"learning_rate": 2.4325141015310236e-05,
|
||||
"loss": 3.2558,
|
||||
"step": 10200
|
||||
},
|
||||
{
|
||||
"epoch": 1.0374697824335213,
|
||||
"grad_norm": 0.80719393491745,
|
||||
"learning_rate": 2.4073327961321516e-05,
|
||||
"loss": 3.2542,
|
||||
"step": 10300
|
||||
},
|
||||
{
|
||||
"epoch": 1.04754230459307,
|
||||
"grad_norm": 0.7982654571533203,
|
||||
"learning_rate": 2.38215149073328e-05,
|
||||
"loss": 3.2499,
|
||||
"step": 10400
|
||||
},
|
||||
{
|
||||
"epoch": 1.0576148267526189,
|
||||
"grad_norm": 0.8148714303970337,
|
||||
"learning_rate": 2.356970185334408e-05,
|
||||
"loss": 3.2567,
|
||||
"step": 10500
|
||||
},
|
||||
{
|
||||
"epoch": 1.0676873489121677,
|
||||
"grad_norm": 0.8775522708892822,
|
||||
"learning_rate": 2.331788879935536e-05,
|
||||
"loss": 3.2449,
|
||||
"step": 10600
|
||||
},
|
||||
{
|
||||
"epoch": 1.0777598710717164,
|
||||
"grad_norm": 0.8066820502281189,
|
||||
"learning_rate": 2.3066075745366643e-05,
|
||||
"loss": 3.255,
|
||||
"step": 10700
|
||||
},
|
||||
{
|
||||
"epoch": 1.087832393231265,
|
||||
"grad_norm": 0.8724685311317444,
|
||||
"learning_rate": 2.2814262691377923e-05,
|
||||
"loss": 3.2485,
|
||||
"step": 10800
|
||||
},
|
||||
{
|
||||
"epoch": 1.0979049153908138,
|
||||
"grad_norm": 0.8520035147666931,
|
||||
"learning_rate": 2.2562449637389206e-05,
|
||||
"loss": 3.2498,
|
||||
"step": 10900
|
||||
},
|
||||
{
|
||||
"epoch": 1.1079774375503626,
|
||||
"grad_norm": 0.8030887246131897,
|
||||
"learning_rate": 2.2310636583400486e-05,
|
||||
"loss": 3.2495,
|
||||
"step": 11000
|
||||
},
|
||||
{
|
||||
"epoch": 1.1180499597099114,
|
||||
"grad_norm": 0.7946292757987976,
|
||||
"learning_rate": 2.2058823529411766e-05,
|
||||
"loss": 3.2576,
|
||||
"step": 11100
|
||||
},
|
||||
{
|
||||
"epoch": 1.1281224818694602,
|
||||
"grad_norm": 0.8343091607093811,
|
||||
"learning_rate": 2.180701047542305e-05,
|
||||
"loss": 3.2547,
|
||||
"step": 11200
|
||||
},
|
||||
{
|
||||
"epoch": 1.1381950040290088,
|
||||
"grad_norm": 0.8491663932800293,
|
||||
"learning_rate": 2.1557715551974214e-05,
|
||||
"loss": 3.2472,
|
||||
"step": 11300
|
||||
},
|
||||
{
|
||||
"epoch": 1.1482675261885575,
|
||||
"grad_norm": 0.8408398628234863,
|
||||
"learning_rate": 2.1305902497985497e-05,
|
||||
"loss": 3.256,
|
||||
"step": 11400
|
||||
},
|
||||
{
|
||||
"epoch": 1.1583400483481063,
|
||||
"grad_norm": 0.8578426837921143,
|
||||
"learning_rate": 2.1054089443996777e-05,
|
||||
"loss": 3.2484,
|
||||
"step": 11500
|
||||
},
|
||||
{
|
||||
"epoch": 1.1684125705076551,
|
||||
"grad_norm": 0.8466009497642517,
|
||||
"learning_rate": 2.080227639000806e-05,
|
||||
"loss": 3.2527,
|
||||
"step": 11600
|
||||
},
|
||||
{
|
||||
"epoch": 1.178485092667204,
|
||||
"grad_norm": 0.8367530107498169,
|
||||
"learning_rate": 2.055046333601934e-05,
|
||||
"loss": 3.2487,
|
||||
"step": 11700
|
||||
},
|
||||
{
|
||||
"epoch": 1.1885576148267527,
|
||||
"grad_norm": 0.8025128245353699,
|
||||
"learning_rate": 2.029865028203062e-05,
|
||||
"loss": 3.2544,
|
||||
"step": 11800
|
||||
},
|
||||
{
|
||||
"epoch": 1.1986301369863013,
|
||||
"grad_norm": 0.8721017241477966,
|
||||
"learning_rate": 2.0046837228041904e-05,
|
||||
"loss": 3.248,
|
||||
"step": 11900
|
||||
},
|
||||
{
|
||||
"epoch": 1.20870265914585,
|
||||
"grad_norm": 0.813346803188324,
|
||||
"learning_rate": 1.9795024174053184e-05,
|
||||
"loss": 3.2481,
|
||||
"step": 12000
|
||||
}
|
||||
],
|
||||
"logging_steps": 100,
|
||||
"max_steps": 19856,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 2,
|
||||
"save_steps": 1000,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": false
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 5.0167808851968e+16,
|
||||
"train_batch_size": 16,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:015134e77b658ff2cb6b63a837b6493eca3b71dbd432840371eabcf2d7a7988d
|
||||
size 5649
|
||||
50259
vocab.json
Normal file
50259
vocab.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user