From c842cdba24c0afc3765a996484deeb40c877cc04 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 10 Apr 2026 23:34:13 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: TIGER-Lab/TIGERScore-13B Source: Original Platform --- .gitattributes | 35 + README.md | 320 +++++ added_tokens.json | 3 + config.json | 27 + configuration.json | 1 + generation_config.json | 10 + model-00001-of-00006.safetensors | 3 + model-00002-of-00006.safetensors | 3 + model-00003-of-00006.safetensors | 3 + model-00004-of-00006.safetensors | 3 + model-00005-of-00006.safetensors | 3 + model-00006-of-00006.safetensors | 3 + model.safetensors.index.json | 370 ++++++ special_tokens_map.json | 30 + tokenizer.model | 3 + tokenizer_config.json | 50 + trainer_state.json | 2011 ++++++++++++++++++++++++++++++ training_args.bin | 3 + 18 files changed, 2881 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 added_tokens.json create mode 100644 config.json create mode 100644 configuration.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00006.safetensors create mode 100644 model-00002-of-00006.safetensors create mode 100644 model-00003-of-00006.safetensors create mode 100644 model-00004-of-00006.safetensors create mode 100644 model-00005-of-00006.safetensors create mode 100644 model-00006-of-00006.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.model create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a6344aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z 
filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..cdb30b9 --- /dev/null +++ b/README.md @@ -0,0 +1,320 @@ +--- +language: +- en +- zh +- ru +- cs +license: mit +tags: +- text evaluation +- metric +- llm metric +- llama +- tigerscore +datasets: +- TIGER-Lab/MetricInstruct +metrics: +- pearsonr +- spearmanr +pipeline_tag: text2text-generation +model-index: +- name: TIGERScore-13B + results: + - task: + type: text-generation + 
name: Text Generation + dataset: + name: AI2 Reasoning Challenge (25-Shot) + type: ai2_arc + config: ARC-Challenge + split: test + args: + num_few_shot: 25 + metrics: + - type: acc_norm + value: 59.04 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: HellaSwag (10-Shot) + type: hellaswag + split: validation + args: + num_few_shot: 10 + metrics: + - type: acc_norm + value: 82.79 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MMLU (5-Shot) + type: cais/mmlu + config: all + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 55.07 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: TruthfulQA (0-shot) + type: truthful_qa + config: multiple_choice + split: validation + args: + num_few_shot: 0 + metrics: + - type: mc2 + value: 40.38 + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: Winogrande (5-shot) + type: winogrande + config: winogrande_xl + split: validation + args: + num_few_shot: 5 + metrics: + - type: acc + value: 74.74 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: GSM8k (5-shot) + type: gsm8k + config: main + split: 
test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 28.73 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TIGER-Lab/TIGERScore-13B + name: Open LLM Leaderboard +--- + + +## TIGERScore + +[Project Page](https://tiger-ai-lab.github.io/TIGERScore/) | [Paper](https://arxiv.org/abs/2310.00752) | [Code](https://github.com/TIGER-AI-Lab/TIGERScore) | [🤗Demo](https://huggingface.co/spaces/TIGER-Lab/TIGERScore) | +[🤗TIGERScore-7B](https://huggingface.co/TIGER-Lab/TIGERScore-7B-V1.2) | [🤗TIGERScore-13B](https://huggingface.co/TIGER-Lab/TIGERScore-13B-V1.2) + +## Introduction + +We present TIGERScore, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Our metric is based on LLaMA-2, trained on our meticulously curated instruction-tuning dataset [MetricInstruct](https://huggingface.co/datasets/TIGER-Lab/MetricInstruct) which covers 6 text generation tasks and 23 text generation datasets. + +Existing automatic metrics are lagging and suffer from issues like 1) **Dependency on references**, 2) **Limited to specific domains**, 3) **Lack of attribution**. Contrary to them, TIGERScore is designed to be driven by natural language instruction and provide detailed error analysis to pinpoint the mistakes in the generated text. + +Specifically, TIGERScore takes an instruction, an associated input context along with a hypothesis output that might contain errors. Then, TIGERScore will evaluate this hypothesis output and list several errors, each consisting of the error location, aspect, explanation and penalty scores (score reduced, starting from 0). The sum of the reduced scores is taken as the overall rating of this output. + +As a reference-free metric, its correlation can even surpass the best existing reference-based metrics. 
We believe TIGERScore demonstrates the possibility of building universal explainable metrics to evaluate any text generation task. + +## Training Data + +The models are trained on the 🤗 [MetricInstruct Dataset](https://huggingface.co/datasets/TIGER-Lab/MetricInstruct), which covers 6 text generation tasks and 22 text generation datasets. Check out the dataset card for more details. + +## Training Procedure + +The models are fine-tuned with the MetricInstruct dataset using the original Llama-2 models as base models. The training procedure varies for different models based on their sizes. Check out our paper for more details. + +## Evaluation + +Experiments show that TIGERScore surpasses existing baseline metrics in correlation with human ratings on all 6 held-in tasks and 1 held-out task, achieving the highest overall performance. We hope the emergence of TIGERScore can promote research in the LLM community as a powerful, interpretable, and easy-to-use metric. + +### Kendall Results +| Tasks⟶ | Summarization | Translation | Data2Text | Long-form QA | MathQA | Instruction Following | Story-Gen | Average | +|----------------------------------------|-----------|-----------|-----------------|-----------|-----------|-----------|-----------|-----------| +| | | | GPT-based | Metrics | | | | | +| GPT-3.5-turbo (few-shot) | **30.45** | 32.3 | 30.38 | 20.91 | **58.57** | 17.73 | 3.26 | 27.65 | +| GPT-4 (zero-shot) | 29.32 | **35.38** | **32.26** | **35.85** | 46.63 | **49.5** | **25.69** | **36.38** | +| | | | Reference-based | Metrics | | | | | +| BLEU | 8.71 | 14.5 | 23.13 | 7.73 | 17.25 | 35.92 | -0.89 | 15.19 | +| ROUGE-2f | 10.67 | 13.19 | 24.74 | 11.73 | 18.07 | 34.59 | 1.78 | 16.4 | +| InstructScore | 20.86 | 40.44 | 30.21 | 15.64 | -3.87 | 13.87 | 13.5 | 18.66 | +| GPTScore-ref | 10.8 | 18.74 | 27.47 | 22.13 | 14.86 | 25.4 | 12.78 | 18.88 | +| BARTScore-cnn (hypo-ref) | 10 | 21.06 | 27.04 | 20.67 | **19.07** | 24.7 | 18.58 | 20.16 | +| BARTScore-para (hypo-ref) | 
10.41 | 24.9 | 28.42 | 20.24 | 14.1 | 26.13 | 12.11 | 19.47 | +| BERTScore | 17.39 | 31.57 | 30.74 | 17.7 | 9.41 | 35.61 | 2 | 20.63 | +| BLEURT | 12.69 | 36.12 | **34.48** | 23.11 | 2.88 | 27.94 | 19.18 | 22.34 | +| UniEval (summ) | **35.89** | 16.08 | 28.56 | **29.32** | 16.15 | 11.93 | **31.22** | 24.17 | +| COMET-22 | 25.01 | **42.79** | 23.43 | 24.66 | -4.52 | **36.17** | 27.52 | **25.01** | +| | | | Reference-free |Metrics | | | | | +| BARTScore-para (src-hypo) | 29.12 | 7.01 | 22.32 | 18.8 | -2.21 | 4.26 | 14.15 | 13.35 | +| BARTScore-cnn (src-hypo) | 26.63 | 9.4 | 23.69 | 28.93 | 1.23 | 19.09 | 23.29 | 18.89 | +| Llama-2-13b-chat-0-shot | 25.22 | 11.79 | 23.45 | 15.96 | 1.08 | 19.5 | 21.52 | 16.93 | +| COMETKiwi | 11.87 | 36.37 | 19.08 | 12.23 | -9.38 | 26.46 | 12.78 | 15.63 | +| GPTScore-src | 28.2 | 6.5 | 19.81 | 27.64 | 11.64 | 20.04 | 16.36 | 18.6 | +| TigerScore-7B | 28.79 | 33.65 | 32.44 | 33.93 | 19.98 | 38.13 | 29.72 | 30.95 | +| TigerScore-13B | **31.29** | **36.5** | **36.43** | **33.17** | **21.58** | **41.84** | **35.33** | **33.73** | +| ∆ (ours - best reference-free) | +2 | +0 | +13 | +4 | +10 | +15 | +14 | +15 | +| ∆ (ours - best reference-based) | -4 | -6 | +2 | +4 | +2 | +5 | +4 | +8 | + +### Pearson Results + +| Tasks⟶ | Summarization | Translation | Data2Text | Long-form QA | MathQA | Instruction Following | Story-Gen | Average | +|-------------------------------|-----------|-----------|-----------------|-----------|-----------|-----------|-----------|-----------| +| | | | GPT-based | Metrics | | | | | +| GPT-3.5-turbo (few-shot) | **45.53** | **43.77** | **47.76** | 29.84 | **61.26** | 15.36 | 7.8 | 35.9 | +| GPT-4 (zero-shot) | 40.75 | 33.92 | 46.83 | **49.3** | 54.98 | **60.45** | **37.74** | **46.28** | +| | | | Reference-based | Metrics | | | | | +| BLEU | 11.66 | 17.47 | 34.29 | 18.21 | 18.12 | 29.47 | -0.64 | 18.37 | +| ROUGE-2f | 16.03 | 16.26 | 35.85 | 19.66 | 20.69 | 33.49 | 2.88 | 20.69 | +| InstructScore | 27.4 | 51.55 | 47.28 
| 20.59 | 0.36 | 20.98 | 12.81 | 25.85 | +| GPTScore-ref | 13.47 | 21.05 | 48.7 | 33.4 | 18.22 | 29.66 | 18.94 | 26.2 | +| BARTScore-cnn (hypo-ref) | 16.67 | 23.56 | 45.08 | 32.78 | **23.09** | 26.57 | 27.61 | 27.91 | +| BARTScore-para (hypo-ref) | 19.73 | 29.04 | 47.89 | 32.7 | 17.33 | 30.2 | 17.76 | 27.81 | +| BERTScore | 26.26 | 37.65 | 48.22 | 26.39 | 11.19 | 45.58 | 4.08 | 28.48 | +| BLEURT | 17.27 | 43 | **54.32** | 34.26 | 3.98 | 39.15 | 27.89 | 31.41 | +| UniEval (summ) | **53.22** | 23.11 | 51.14 | **36.95** | 17.69 | 30.87 | **44.88** | 36.84 | +| COMET-22 | 35.32 | **58.46** | 43.82 | 36.79 | -5.58 | **49.68** | 40.12 | **36.94** | +| | | | Reference-free | Metrics | | | | | +| BARTScore-para (src-hypo) | 43.11 | 6.96 | 37.82 | 29.86 | -0.41 | 19.37 | 19.99 | 22.38 | +| BARTScore-cnn (src-hypo) | 39.72 | 9.53 | 45.43 | 41.48 | 3.28 | 34.97 | 33.51 | 29.7 | +| Llama-2-13b-chat-0-shot | 29.59 | 9.09 | 41.32 | 21.67 | 2.8 | 22.71 | 21.13 | 21.19 | +| COMETKiwi | 14.22 | **50.91** | 23.63 | 22.59 | -13.35 | 34.46 | 19.12 | 21.65 | +| GPTScore-src | 41.71 | 6.82 | 41.19 | 39.79 | 13.99 | 27.59 | 23.22 | 27.76 | +| TigerScore-7B | 43.95 | 37.7 | 49.13 | **46.1** | 21.77 | 38.26 | 39.9 | 39.54 | +| TigerScore-13B | **44.21** | 41.54 | **52.87** | 44.76 | **24.41** | **47.52** | **47.66** | **43.28** | +| ∆ (ours - best reference-free) | +1 | -9 | +7 | +5 | +10 | +20 | +14 | +13 | +| ∆ (ours - best reference-based) | -9 | -17 | -2 | +9 | +1 | -2 | +3 | +6 | + +### Spearman Results + +| Tasks⟶ | Summarization | Translation | Data2Text | Long-form QA | MathQA | Instruction Following | Story-Gen | Average | +|-------------------------------------------|----------------|----------------|----------------|-----------------|----------------|----------------|----------------|----------------| +| | | | GPT-based | Metrics | | | | | +| GPT-3.5-turbo (few-shot) | **38.50** | 40.53 | 40.20 | 29.33 | **66.46** | 23.20 | 4.77 | 34.71 | +| GPT-4 (zero-shot) | 36.46 | **43.87** 
| **44.04** | **48.95** | 51.71 | **58.53** | **32.48** | **45.15** | +| | | | Reference-based | Metrics | | | | | +| BLEU | 11.98 | 19.73 | 33.29 | 11.38 | 21.12 | **46.61** | -1.17 | 20.42 | +| ROUGE-2f | 14.53 | 17.83 | 35.49 | 16.83 | 22.12 | 44.56 | 2.34 | 21.96 | +| InstructScore | 26.33 | 47.30 | 43.93 | 21.62 | -4.15 | 16.19 | 16.13 | 23.91 | +| GPTScore-ref | 14.73 | 24.95 | 39.42 | 31.60 | 18.20 | 33.14 | 18.24 | 25.75 | +| BARTScore-cnn(hypo-ref) | 13.64 | 28.53 | 36.12 | 29.57 | **23.35** | 32.49 | 26.64 | 27.19 | +| BARTScore-para (hypo-ref) | 17.18 | 33.72 | 40.79 | 28.94 | 17.27 | 34.47 | 17.43 | 27.11 | +| BERTScore | 23.67 | 42.41 | 43.75 | 25.60 | 11.53 | 45.77 | 2.88 | 27.95 | +| BLEURT | 17.30 | 48.41 | **48.76** | 33.26 | 3.53 | 36.46 | 27.52 | 30.75 | +| UniEval(summ) | **47.52** | 21.90 | 38.38 | **41.83** | 19.78 | 16.02 | **44.46** | 32.84 | +| COMET-22 | 33.75 | **56.35** | 33.92 | 35.28 | -5.53 | 46.13 | 39.20 | **34.16** | +| | | | Reference-free | Metrics | | | | | +| BARTScore-para (src-hypo) | **38.68** | 9.60 | 32.26 | 26.86 | -2.70 | 5.92 | 20.55 | 18.74 | +| BARTScore-cnn (src-hypo) | 35.50 | 12.83 | 34.33 | 40.96 | 1.50 | 25.43 | 33.48 | 26.29 | +| Llama-2-13b-chat-0-shot | 28.53 | 14.38 | 29.24 | 19.91 | 1.08 | 21.37 | 26.78 | 20.18 | +| COMETKiwi | 16.27 | **48.48** | 27.90 | 18.05 | -11.48 | 34.86 | 18.47 | 21.79 | +| GPTScore-src | 37.41 | 8.90 | 28.82 | 39.48 | 14.25 | 26.46 | 23.91 | 25.61 | +| TIGERScore-7B (ours) | 35.11 | 41.50 | 42.39 | **47.11** | 21.23 | 43.57 | 39.26 | 38.60 | +| TIGERScore-13B (ours) | 36.81 | 44.99 | **45.88** | 46.22 | **23.32** | **47.03** | **46.36** | **41.52** | +| Δ (ours - best reference-free) | -2 | -3 | +12 | +5 | +9 | +14 | +13 | +16 | +| ∆ (ours - best reference-based) | -9 | -11 | -3 | +5 | -0 | +0 | +2 | +7 | + +## Usage + +TIGERScore can be easily loaded in 2 lines of codes, and provides a friendly scoring interface function. 
+ +To use TIGERScore, first install `tigerscore` with +```bash +pip install git+https://github.com/TIGER-AI-Lab/TIGERScore.git +``` + +Then load the TIGERScore model variant according to your needs. +```python +# set up scorer +from tigerscore import TIGERScorer +scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B") # on GPU +# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B", quantized=True) # 4 bit quantization on GPU +# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B", use_vllm=True) # VLLM on GPU, Recommended for faster evaluation (0.2s per instance) +# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B-GGUF", use_llamacpp=True) # 4 bit quantization on CPU +``` + +After loading, you can easily get the errors in the provided **hypothesis output** given the **instruction** and **input context** +```python +# example +instruction = "Write an apology letter." +input_context = "Reason: You canceled a plan at the last minute due to illness." +hypo_output = "Hey [Recipient],\n\nI'm really sorry for ditching our plan. I suddenly got an opportunity for a vacation so I took it. I know this might have messed up your plans and I regret that.\n\nDespite being under the weather, I would rather go for an adventure. I hope you can understand my perspective and I hope this incident doesn't change anything between us.\n\nWe can reschedule our plan for another time. Sorry again for the trouble.\n\nPeace out,\n[Your Name]\n\n---" +results = scorer.score([instruction], [hypo_output], [input_context]) +print(results) +``` + +Results are a list of errors with detailed explanations and reasonable penalty scores: +```json +[ + { + "num_errors": 2, + "score": -7.0, + "errors": { + "error_0": { + "location": " \"I suddenly got an opportunity for a vacation so I took it.\"", + "aspect": " Misunderstanding context", + "explanation": " The error lies in the context of the reason for cancelling the plan. 
The original reason was due to illness, but in the incorrect output, it is stated that the cancellation was due to a vacation opportunity, which is a misunderstanding of the context. The correction would be to stick to the original reason for cancelling.", + "severity": "Major", + "score_reduction": "5.0" + }, + "error_1": { + "location": " \"I hope you can understand my perspective and I hope this incident doesn't change anything between us.\"", + "aspect": " Inappropriate tone", + "explanation": " The tone of this sentence is too casual and lacks regret or apology. It's important to maintain a formal and regretful tone in an apology letter. The sentence could be corrected to something like \"I hope you can find it in your heart to forgive me and let this incident not strain our relationship.\"", + "severity": "Minor", + "score_reduction": "2.0" + } + }, + "raw_output": " The model-generated output contains 2 errors, with a total score reduction of 7.0.\nError location 1: ..." + } +] +``` + +Check more usage at our [Github Usage Doc](https://github.com/TIGER-AI-Lab/TIGERScore#usage). Have Fun! + +## Citation + +If you find our work useful, please cite our paper: +``` +@article{jiang2023TIGERScore, + title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks}, + author={Dongfu Jiang, Yishan Li, Ge Zhang, Wenhao Huang, Bill Yuchen Lin, Wenhu Chen}, + journal={arXiv preprint arXiv:2310.00752}, + year={2023} +} +``` +# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_TIGER-Lab__TIGERScore-13B) + +| Metric |Value| +|---------------------------------|----:| +|Avg. 
|56.79| +|AI2 Reasoning Challenge (25-Shot)|59.04| +|HellaSwag (10-Shot) |82.79| +|MMLU (5-Shot) |55.07| +|TruthfulQA (0-shot) |40.38| +|Winogrande (5-shot) |74.74| +|GSM8k (5-shot) |28.73| + diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..e41416d --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32000 +} diff --git a/config.json b/config.json new file mode 100644 index 0000000..15844b9 --- /dev/null +++ b/config.json @@ -0,0 +1,27 @@ +{ + "_name_or_path": "/ML-A100/models/Llama-2-13b-hf", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 40, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.35.1", + "use_cache": true, + "vocab_size": 32001 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..ee50c6d --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "max_length": 4096, + "pad_token_id": 0, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.35.1" +} diff --git a/model-00001-of-00006.safetensors b/model-00001-of-00006.safetensors new file mode 100644 index 0000000..561e05f --- /dev/null +++ b/model-00001-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:a6d8b53803d945c25ee4c102fe01fe5a0c8d3d4dedbed67678cb0db00164c5b5 +size 4978276040 diff --git a/model-00002-of-00006.safetensors b/model-00002-of-00006.safetensors new file mode 100644 index 0000000..bd02f8f --- /dev/null +++ b/model-00002-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c091def96de2022fe77bdc713df664018ca6a586b01e5506a33d9953a45b39f +size 4970422232 diff --git a/model-00003-of-00006.safetensors b/model-00003-of-00006.safetensors new file mode 100644 index 0000000..d7b91e3 --- /dev/null +++ b/model-00003-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb03620e493ab9e819b898eff12aef0a128b26982b0c4d557ad528df518e7ea +size 4970422256 diff --git a/model-00004-of-00006.safetensors b/model-00004-of-00006.safetensors new file mode 100644 index 0000000..482f1f1 --- /dev/null +++ b/model-00004-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c385a8cf536eb88603598a5b1602e31323fe53ed2abb4c720e50ca5129b6bf9f +size 4933701504 diff --git a/model-00005-of-00006.safetensors b/model-00005-of-00006.safetensors new file mode 100644 index 0000000..3827210 --- /dev/null +++ b/model-00005-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a524eeee2916df42aa936889bd7034c6f75fb1ae8b8d9ab85d9a4117a69b56 +size 4933722216 diff --git a/model-00006-of-00006.safetensors b/model-00006-of-00006.safetensors new file mode 100644 index 0000000..cf1ca29 --- /dev/null +++ b/model-00006-of-00006.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4855e8b0b538bca923c6ae5b927e177b2196500f7c47f55f93af25865a912a +size 1245247160 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..f832eab --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,370 @@ +{ + "metadata": { + "total_size": 26031749120 + }, + 
"weight_map": { + "lm_head.weight": "model-00006-of-00006.safetensors", + "model.embed_tokens.weight": "model-00001-of-00006.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + 
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.mlp.gate_proj.weight": 
"model-00002-of-00006.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + 
"model.layers.16.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.self_attn.o_proj.weight": 
"model-00003-of-00006.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + 
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", + "model.layers.23.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.23.mlp.down_proj.weight": 
"model-00004-of-00006.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + 
"model.layers.25.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.28.self_attn.k_proj.weight": 
"model-00004-of-00006.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.input_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.30.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.30.mlp.up_proj.weight": 
"model-00004-of-00006.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", + "model.layers.31.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.input_layernorm.weight": "model-00005-of-00006.safetensors", + 
"model.layers.33.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.35.self_attn.q_proj.weight": 
"model-00005-of-00006.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.36.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.36.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.36.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.input_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.37.mlp.down_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.mlp.up_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", + "model.layers.37.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.38.input_layernorm.weight": "model-00006-of-00006.safetensors", + "model.layers.38.mlp.down_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.38.mlp.up_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", + 
"model.layers.38.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", + "model.layers.39.input_layernorm.weight": "model-00006-of-00006.safetensors", + "model.layers.39.mlp.down_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.mlp.gate_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.mlp.up_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", + "model.layers.39.self_attn.k_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.self_attn.q_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.39.self_attn.v_proj.weight": "model-00006-of-00006.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + 
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors", + 
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", + "model.norm.weight": "model-00006-of-00006.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..0fe0f75 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..6c00c74 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..9d161b4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,50 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1024, + "pad_token": "[PAD]", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..cf4511f --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2011 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 1.3082402064781276e-06, + "loss": 0.7935, + "step": 2 + }, + { + 
"epoch": 0.01, + "learning_rate": 2.6164804129562553e-06, + "loss": 0.7, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 3.381751875681663e-06, + "loss": 0.6316, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 3.924720619434383e-06, + "loss": 0.5405, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 4.345879896760937e-06, + "loss": 0.522, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 4.689992082159791e-06, + "loss": 0.4827, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 4.980934789368156e-06, + "loss": 0.4555, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 5.2329608259125105e-06, + "loss": 0.45, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 5.4552635448851985e-06, + "loss": 0.4315, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 5.6541201032390644e-06, + "loss": 0.4153, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 5.83400774154115e-06, + "loss": 0.4374, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 5.99823228863792e-06, + "loss": 0.4171, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 6.149304227398896e-06, + "loss": 0.41, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 6.289174995846284e-06, + "loss": 0.4101, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 6.419391565964472e-06, + "loss": 0.4075, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 6.541201032390639e-06, + "loss": 0.395, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 6.655623437887147e-06, + "loss": 0.3873, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 6.763503751363326e-06, + "loss": 0.4005, + "step": 36 + }, + { + "epoch": 0.11, + "learning_rate": 6.865549773769684e-06, + "loss": 0.3754, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 6.9623603097171925e-06, + "loss": 0.3733, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 7.054446458571692e-06, + "loss": 0.3622, + "step": 42 + }, + { + "epoch": 0.13, + 
"learning_rate": 7.1422479480192775e-06, + "loss": 0.369, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 7.226145833886759e-06, + "loss": 0.3745, + "step": 46 + }, + { + "epoch": 0.14, + "learning_rate": 7.306472495116047e-06, + "loss": 0.3775, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 7.3835195870437456e-06, + "loss": 0.3652, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 7.457544433877025e-06, + "loss": 0.3645, + "step": 52 + }, + { + "epoch": 0.16, + "learning_rate": 7.528775214088733e-06, + "loss": 0.3554, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 7.597415202324413e-06, + "loss": 0.3591, + "step": 56 + }, + { + "epoch": 0.17, + "learning_rate": 7.663646266610644e-06, + "loss": 0.3797, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 7.7276317724426e-06, + "loss": 0.363, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 7.789519010511834e-06, + "loss": 0.367, + "step": 62 + }, + { + "epoch": 0.19, + "learning_rate": 7.849441238868767e-06, + "loss": 0.367, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 7.907519410744684e-06, + "loss": 0.3696, + "step": 66 + }, + { + "epoch": 0.2, + "learning_rate": 7.963863644365277e-06, + "loss": 0.3627, + "step": 68 + }, + { + "epoch": 0.21, + "learning_rate": 8.018574479650967e-06, + "loss": 0.3689, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 8.071743957841455e-06, + "loss": 0.344, + "step": 72 + }, + { + "epoch": 0.22, + "learning_rate": 8.123456553166724e-06, + "loss": 0.3412, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 8.173789980247812e-06, + "loss": 0.3574, + "step": 76 + }, + { + "epoch": 0.23, + "learning_rate": 8.222815896602431e-06, + "loss": 0.339, + "step": 78 + }, + { + "epoch": 0.24, + "learning_rate": 8.270600516195319e-06, + "loss": 0.3547, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 8.317205147216999e-06, + "loss": 0.358, + "step": 82 + }, + { + "epoch": 0.25, + "learning_rate": 
8.36268666504982e-06, + "loss": 0.3475, + "step": 84 + }, + { + "epoch": 0.26, + "learning_rate": 8.407097929574588e-06, + "loss": 0.3554, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 8.450488154497406e-06, + "loss": 0.3406, + "step": 88 + }, + { + "epoch": 0.27, + "learning_rate": 8.492903235168008e-06, + "loss": 0.3267, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 8.534386040364887e-06, + "loss": 0.341, + "step": 92 + }, + { + "epoch": 0.28, + "learning_rate": 8.574976672697987e-06, + "loss": 0.3524, + "step": 94 + }, + { + "epoch": 0.29, + "learning_rate": 8.614712701594175e-06, + "loss": 0.3584, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 8.653629372258186e-06, + "loss": 0.3351, + "step": 98 + }, + { + "epoch": 0.3, + "learning_rate": 8.691759793521874e-06, + "loss": 0.3522, + "step": 100 + }, + { + "epoch": 0.31, + "learning_rate": 8.729135107090682e-06, + "loss": 0.3544, + "step": 102 + }, + { + "epoch": 0.31, + "learning_rate": 8.765784640355151e-06, + "loss": 0.3408, + "step": 104 + }, + { + "epoch": 0.32, + "learning_rate": 8.80173604464618e-06, + "loss": 0.3519, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 8.837015420566862e-06, + "loss": 0.3417, + "step": 108 + }, + { + "epoch": 0.33, + "learning_rate": 8.87164743182396e-06, + "loss": 0.3555, + "step": 110 + }, + { + "epoch": 0.34, + "learning_rate": 8.90565540880254e-06, + "loss": 0.3477, + "step": 112 + }, + { + "epoch": 0.34, + "learning_rate": 8.93906144297322e-06, + "loss": 0.3298, + "step": 114 + }, + { + "epoch": 0.35, + "learning_rate": 8.971886473088772e-06, + "loss": 0.3431, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 9.004150364012388e-06, + "loss": 0.3504, + "step": 118 + }, + { + "epoch": 0.36, + "learning_rate": 9.035871978920727e-06, + "loss": 0.3327, + "step": 120 + }, + { + "epoch": 0.37, + "learning_rate": 9.067069245538941e-06, + "loss": 0.3284, + "step": 122 + }, + { + "epoch": 0.37, + "learning_rate": 
9.09775921698996e-06, + "loss": 0.3345, + "step": 124 + }, + { + "epoch": 0.38, + "learning_rate": 9.127958127775227e-06, + "loss": 0.3464, + "step": 126 + }, + { + "epoch": 0.39, + "learning_rate": 9.157681445346895e-06, + "loss": 0.336, + "step": 128 + }, + { + "epoch": 0.39, + "learning_rate": 9.186943917681705e-06, + "loss": 0.3546, + "step": 130 + }, + { + "epoch": 0.4, + "learning_rate": 9.215759617222812e-06, + "loss": 0.3194, + "step": 132 + }, + { + "epoch": 0.4, + "learning_rate": 9.244141981517345e-06, + "loss": 0.3296, + "step": 134 + }, + { + "epoch": 0.41, + "learning_rate": 9.272103850843403e-06, + "loss": 0.3331, + "step": 136 + }, + { + "epoch": 0.42, + "learning_rate": 9.299657503090295e-06, + "loss": 0.3362, + "step": 138 + }, + { + "epoch": 0.42, + "learning_rate": 9.326814686129093e-06, + "loss": 0.3414, + "step": 140 + }, + { + "epoch": 0.43, + "learning_rate": 9.353586647887207e-06, + "loss": 0.3249, + "step": 142 + }, + { + "epoch": 0.43, + "learning_rate": 9.379984164319582e-06, + "loss": 0.3334, + "step": 144 + }, + { + "epoch": 0.44, + "learning_rate": 9.406017565450707e-06, + "loss": 0.3294, + "step": 146 + }, + { + "epoch": 0.45, + "learning_rate": 9.43169675964485e-06, + "loss": 0.3436, + "step": 148 + }, + { + "epoch": 0.45, + "learning_rate": 9.457031256247281e-06, + "loss": 0.3216, + "step": 150 + }, + { + "epoch": 0.46, + "learning_rate": 9.48203018672594e-06, + "loss": 0.3509, + "step": 152 + }, + { + "epoch": 0.46, + "learning_rate": 9.50670232443118e-06, + "loss": 0.326, + "step": 154 + }, + { + "epoch": 0.47, + "learning_rate": 9.53105610308056e-06, + "loss": 0.3391, + "step": 156 + }, + { + "epoch": 0.48, + "learning_rate": 9.555099634066188e-06, + "loss": 0.3264, + "step": 158 + }, + { + "epoch": 0.48, + "learning_rate": 9.578840722673449e-06, + "loss": 0.3245, + "step": 160 + }, + { + "epoch": 0.49, + "learning_rate": 9.602286883292267e-06, + "loss": 0.333, + "step": 162 + }, + { + "epoch": 0.49, + "learning_rate": 
9.625445353695127e-06, + "loss": 0.3491, + "step": 164 + }, + { + "epoch": 0.5, + "learning_rate": 9.648323108449636e-06, + "loss": 0.34, + "step": 166 + }, + { + "epoch": 0.51, + "learning_rate": 9.670926871527948e-06, + "loss": 0.3343, + "step": 168 + }, + { + "epoch": 0.51, + "learning_rate": 9.693263128169957e-06, + "loss": 0.3311, + "step": 170 + }, + { + "epoch": 0.52, + "learning_rate": 9.715338136052716e-06, + "loss": 0.333, + "step": 172 + }, + { + "epoch": 0.52, + "learning_rate": 9.73715793581418e-06, + "loss": 0.3002, + "step": 174 + }, + { + "epoch": 0.53, + "learning_rate": 9.758728360975532e-06, + "loss": 0.3271, + "step": 176 + }, + { + "epoch": 0.54, + "learning_rate": 9.780055047302923e-06, + "loss": 0.323, + "step": 178 + }, + { + "epoch": 0.54, + "learning_rate": 9.801143441646136e-06, + "loss": 0.3562, + "step": 180 + }, + { + "epoch": 0.55, + "learning_rate": 9.821998810288924e-06, + "loss": 0.3228, + "step": 182 + }, + { + "epoch": 0.55, + "learning_rate": 9.842626246843015e-06, + "loss": 0.3422, + "step": 184 + }, + { + "epoch": 0.56, + "learning_rate": 9.863030679715369e-06, + "loss": 0.3296, + "step": 186 + }, + { + "epoch": 0.57, + "learning_rate": 9.883216879176116e-06, + "loss": 0.3176, + "step": 188 + }, + { + "epoch": 0.57, + "learning_rate": 9.903189464052494e-06, + "loss": 0.3297, + "step": 190 + }, + { + "epoch": 0.58, + "learning_rate": 9.922952908072303e-06, + "loss": 0.3246, + "step": 192 + }, + { + "epoch": 0.58, + "learning_rate": 9.942511545878664e-06, + "loss": 0.3261, + "step": 194 + }, + { + "epoch": 0.59, + "learning_rate": 9.961869578736312e-06, + "loss": 0.3258, + "step": 196 + }, + { + "epoch": 0.6, + "learning_rate": 9.98103107994822e-06, + "loss": 0.3269, + "step": 198 + }, + { + "epoch": 0.6, + "learning_rate": 1e-05, + "loss": 0.324, + "step": 200 + }, + { + "epoch": 0.61, + "learning_rate": 1e-05, + "loss": 0.3106, + "step": 202 + }, + { + "epoch": 0.61, + "learning_rate": 1e-05, + "loss": 0.3086, + "step": 204 + 
}, + { + "epoch": 0.62, + "learning_rate": 1e-05, + "loss": 0.3269, + "step": 206 + }, + { + "epoch": 0.63, + "learning_rate": 1e-05, + "loss": 0.3396, + "step": 208 + }, + { + "epoch": 0.63, + "learning_rate": 1e-05, + "loss": 0.3249, + "step": 210 + }, + { + "epoch": 0.64, + "learning_rate": 1e-05, + "loss": 0.3294, + "step": 212 + }, + { + "epoch": 0.64, + "learning_rate": 1e-05, + "loss": 0.3502, + "step": 214 + }, + { + "epoch": 0.65, + "learning_rate": 1e-05, + "loss": 0.3329, + "step": 216 + }, + { + "epoch": 0.66, + "learning_rate": 1e-05, + "loss": 0.3501, + "step": 218 + }, + { + "epoch": 0.66, + "learning_rate": 1e-05, + "loss": 0.3375, + "step": 220 + }, + { + "epoch": 0.67, + "learning_rate": 1e-05, + "loss": 0.3326, + "step": 222 + }, + { + "epoch": 0.67, + "learning_rate": 1e-05, + "loss": 0.3312, + "step": 224 + }, + { + "epoch": 0.68, + "learning_rate": 1e-05, + "loss": 0.3246, + "step": 226 + }, + { + "epoch": 0.69, + "learning_rate": 1e-05, + "loss": 0.3142, + "step": 228 + }, + { + "epoch": 0.69, + "learning_rate": 1e-05, + "loss": 0.3246, + "step": 230 + }, + { + "epoch": 0.7, + "learning_rate": 1e-05, + "loss": 0.329, + "step": 232 + }, + { + "epoch": 0.7, + "learning_rate": 1e-05, + "loss": 0.3446, + "step": 234 + }, + { + "epoch": 0.71, + "learning_rate": 1e-05, + "loss": 0.3205, + "step": 236 + }, + { + "epoch": 0.72, + "learning_rate": 1e-05, + "loss": 0.3066, + "step": 238 + }, + { + "epoch": 0.72, + "learning_rate": 1e-05, + "loss": 0.3267, + "step": 240 + }, + { + "epoch": 0.73, + "learning_rate": 1e-05, + "loss": 0.3169, + "step": 242 + }, + { + "epoch": 0.73, + "learning_rate": 1e-05, + "loss": 0.333, + "step": 244 + }, + { + "epoch": 0.74, + "learning_rate": 1e-05, + "loss": 0.3155, + "step": 246 + }, + { + "epoch": 0.75, + "learning_rate": 1e-05, + "loss": 0.3263, + "step": 248 + }, + { + "epoch": 0.75, + "learning_rate": 1e-05, + "loss": 0.3154, + "step": 250 + }, + { + "epoch": 0.76, + "learning_rate": 1e-05, + "loss": 0.3303, + 
"step": 252 + }, + { + "epoch": 0.77, + "learning_rate": 1e-05, + "loss": 0.3298, + "step": 254 + }, + { + "epoch": 0.77, + "learning_rate": 1e-05, + "loss": 0.3319, + "step": 256 + }, + { + "epoch": 0.78, + "learning_rate": 1e-05, + "loss": 0.3178, + "step": 258 + }, + { + "epoch": 0.78, + "learning_rate": 1e-05, + "loss": 0.3178, + "step": 260 + }, + { + "epoch": 0.79, + "learning_rate": 1e-05, + "loss": 0.3163, + "step": 262 + }, + { + "epoch": 0.8, + "learning_rate": 1e-05, + "loss": 0.2977, + "step": 264 + }, + { + "epoch": 0.8, + "learning_rate": 1e-05, + "loss": 0.3184, + "step": 266 + }, + { + "epoch": 0.81, + "learning_rate": 1e-05, + "loss": 0.3359, + "step": 268 + }, + { + "epoch": 0.81, + "learning_rate": 1e-05, + "loss": 0.321, + "step": 270 + }, + { + "epoch": 0.82, + "learning_rate": 1e-05, + "loss": 0.3275, + "step": 272 + }, + { + "epoch": 0.83, + "learning_rate": 1e-05, + "loss": 0.3198, + "step": 274 + }, + { + "epoch": 0.83, + "learning_rate": 1e-05, + "loss": 0.3352, + "step": 276 + }, + { + "epoch": 0.84, + "learning_rate": 1e-05, + "loss": 0.3387, + "step": 278 + }, + { + "epoch": 0.84, + "learning_rate": 1e-05, + "loss": 0.3176, + "step": 280 + }, + { + "epoch": 0.85, + "learning_rate": 1e-05, + "loss": 0.3278, + "step": 282 + }, + { + "epoch": 0.86, + "learning_rate": 1e-05, + "loss": 0.2959, + "step": 284 + }, + { + "epoch": 0.86, + "learning_rate": 1e-05, + "loss": 0.3213, + "step": 286 + }, + { + "epoch": 0.87, + "learning_rate": 1e-05, + "loss": 0.3154, + "step": 288 + }, + { + "epoch": 0.87, + "learning_rate": 1e-05, + "loss": 0.3158, + "step": 290 + }, + { + "epoch": 0.88, + "learning_rate": 1e-05, + "loss": 0.3068, + "step": 292 + }, + { + "epoch": 0.89, + "learning_rate": 1e-05, + "loss": 0.3178, + "step": 294 + }, + { + "epoch": 0.89, + "learning_rate": 1e-05, + "loss": 0.3241, + "step": 296 + }, + { + "epoch": 0.9, + "learning_rate": 1e-05, + "loss": 0.3192, + "step": 298 + }, + { + "epoch": 0.9, + "learning_rate": 1e-05, + 
"loss": 0.3238, + "step": 300 + }, + { + "epoch": 0.91, + "learning_rate": 1e-05, + "loss": 0.3259, + "step": 302 + }, + { + "epoch": 0.92, + "learning_rate": 1e-05, + "loss": 0.2884, + "step": 304 + }, + { + "epoch": 0.92, + "learning_rate": 1e-05, + "loss": 0.3148, + "step": 306 + }, + { + "epoch": 0.93, + "learning_rate": 1e-05, + "loss": 0.3133, + "step": 308 + }, + { + "epoch": 0.93, + "learning_rate": 1e-05, + "loss": 0.3141, + "step": 310 + }, + { + "epoch": 0.94, + "learning_rate": 1e-05, + "loss": 0.3187, + "step": 312 + }, + { + "epoch": 0.95, + "learning_rate": 1e-05, + "loss": 0.3211, + "step": 314 + }, + { + "epoch": 0.95, + "learning_rate": 1e-05, + "loss": 0.3198, + "step": 316 + }, + { + "epoch": 0.96, + "learning_rate": 1e-05, + "loss": 0.316, + "step": 318 + }, + { + "epoch": 0.96, + "learning_rate": 1e-05, + "loss": 0.329, + "step": 320 + }, + { + "epoch": 0.97, + "learning_rate": 1e-05, + "loss": 0.3008, + "step": 322 + }, + { + "epoch": 0.98, + "learning_rate": 1e-05, + "loss": 0.3184, + "step": 324 + }, + { + "epoch": 0.98, + "learning_rate": 1e-05, + "loss": 0.3329, + "step": 326 + }, + { + "epoch": 0.99, + "learning_rate": 1e-05, + "loss": 0.3106, + "step": 328 + }, + { + "epoch": 0.99, + "learning_rate": 1e-05, + "loss": 0.3133, + "step": 330 + }, + { + "epoch": 1.0, + "learning_rate": 1e-05, + "loss": 0.3238, + "step": 332 + }, + { + "epoch": 1.01, + "learning_rate": 1e-05, + "loss": 0.3145, + "step": 334 + }, + { + "epoch": 1.01, + "learning_rate": 1e-05, + "loss": 0.3053, + "step": 336 + }, + { + "epoch": 1.02, + "learning_rate": 1e-05, + "loss": 0.3025, + "step": 338 + }, + { + "epoch": 1.02, + "learning_rate": 1e-05, + "loss": 0.2953, + "step": 340 + }, + { + "epoch": 1.03, + "learning_rate": 1e-05, + "loss": 0.302, + "step": 342 + }, + { + "epoch": 1.04, + "learning_rate": 1e-05, + "loss": 0.2956, + "step": 344 + }, + { + "epoch": 1.04, + "learning_rate": 1e-05, + "loss": 0.2763, + "step": 346 + }, + { + "epoch": 1.05, + 
"learning_rate": 1e-05, + "loss": 0.2848, + "step": 348 + }, + { + "epoch": 1.05, + "learning_rate": 1e-05, + "loss": 0.2664, + "step": 350 + }, + { + "epoch": 1.06, + "learning_rate": 1e-05, + "loss": 0.2641, + "step": 352 + }, + { + "epoch": 1.07, + "learning_rate": 1e-05, + "loss": 0.2783, + "step": 354 + }, + { + "epoch": 1.07, + "learning_rate": 1e-05, + "loss": 0.2757, + "step": 356 + }, + { + "epoch": 1.08, + "learning_rate": 1e-05, + "loss": 0.2647, + "step": 358 + }, + { + "epoch": 1.08, + "learning_rate": 1e-05, + "loss": 0.2725, + "step": 360 + }, + { + "epoch": 1.09, + "learning_rate": 1e-05, + "loss": 0.277, + "step": 362 + }, + { + "epoch": 1.1, + "learning_rate": 1e-05, + "loss": 0.2648, + "step": 364 + }, + { + "epoch": 1.1, + "learning_rate": 1e-05, + "loss": 0.2593, + "step": 366 + }, + { + "epoch": 1.11, + "learning_rate": 1e-05, + "loss": 0.2654, + "step": 368 + }, + { + "epoch": 1.11, + "learning_rate": 1e-05, + "loss": 0.2574, + "step": 370 + }, + { + "epoch": 1.12, + "learning_rate": 1e-05, + "loss": 0.2528, + "step": 372 + }, + { + "epoch": 1.13, + "learning_rate": 1e-05, + "loss": 0.2415, + "step": 374 + }, + { + "epoch": 1.13, + "learning_rate": 1e-05, + "loss": 0.2493, + "step": 376 + }, + { + "epoch": 1.14, + "learning_rate": 1e-05, + "loss": 0.2543, + "step": 378 + }, + { + "epoch": 1.14, + "learning_rate": 1e-05, + "loss": 0.259, + "step": 380 + }, + { + "epoch": 1.15, + "learning_rate": 1e-05, + "loss": 0.2445, + "step": 382 + }, + { + "epoch": 1.16, + "learning_rate": 1e-05, + "loss": 0.2476, + "step": 384 + }, + { + "epoch": 1.16, + "learning_rate": 1e-05, + "loss": 0.2388, + "step": 386 + }, + { + "epoch": 1.17, + "learning_rate": 1e-05, + "loss": 0.2398, + "step": 388 + }, + { + "epoch": 1.17, + "learning_rate": 1e-05, + "loss": 0.2559, + "step": 390 + }, + { + "epoch": 1.18, + "learning_rate": 1e-05, + "loss": 0.2398, + "step": 392 + }, + { + "epoch": 1.19, + "learning_rate": 1e-05, + "loss": 0.245, + "step": 394 + }, + { + 
"epoch": 1.19, + "learning_rate": 1e-05, + "loss": 0.2502, + "step": 396 + }, + { + "epoch": 1.2, + "learning_rate": 1e-05, + "loss": 0.2541, + "step": 398 + }, + { + "epoch": 1.2, + "learning_rate": 1e-05, + "loss": 0.2433, + "step": 400 + }, + { + "epoch": 1.21, + "learning_rate": 1e-05, + "loss": 0.2486, + "step": 402 + }, + { + "epoch": 1.22, + "learning_rate": 1e-05, + "loss": 0.2297, + "step": 404 + }, + { + "epoch": 1.22, + "learning_rate": 1e-05, + "loss": 0.2286, + "step": 406 + }, + { + "epoch": 1.23, + "learning_rate": 1e-05, + "loss": 0.2425, + "step": 408 + }, + { + "epoch": 1.23, + "learning_rate": 1e-05, + "loss": 0.2271, + "step": 410 + }, + { + "epoch": 1.24, + "learning_rate": 1e-05, + "loss": 0.2394, + "step": 412 + }, + { + "epoch": 1.25, + "learning_rate": 1e-05, + "loss": 0.2446, + "step": 414 + }, + { + "epoch": 1.25, + "learning_rate": 1e-05, + "loss": 0.2342, + "step": 416 + }, + { + "epoch": 1.26, + "learning_rate": 1e-05, + "loss": 0.2423, + "step": 418 + }, + { + "epoch": 1.27, + "learning_rate": 1e-05, + "loss": 0.2334, + "step": 420 + }, + { + "epoch": 1.27, + "learning_rate": 1e-05, + "loss": 0.2213, + "step": 422 + }, + { + "epoch": 1.28, + "learning_rate": 1e-05, + "loss": 0.2254, + "step": 424 + }, + { + "epoch": 1.28, + "learning_rate": 1e-05, + "loss": 0.2385, + "step": 426 + }, + { + "epoch": 1.29, + "learning_rate": 1e-05, + "loss": 0.2439, + "step": 428 + }, + { + "epoch": 1.3, + "learning_rate": 1e-05, + "loss": 0.2314, + "step": 430 + }, + { + "epoch": 1.3, + "learning_rate": 1e-05, + "loss": 0.2423, + "step": 432 + }, + { + "epoch": 1.31, + "learning_rate": 1e-05, + "loss": 0.2387, + "step": 434 + }, + { + "epoch": 1.31, + "learning_rate": 1e-05, + "loss": 0.2283, + "step": 436 + }, + { + "epoch": 1.32, + "learning_rate": 1e-05, + "loss": 0.2358, + "step": 438 + }, + { + "epoch": 1.33, + "learning_rate": 1e-05, + "loss": 0.2271, + "step": 440 + }, + { + "epoch": 1.33, + "learning_rate": 1e-05, + "loss": 0.2403, + "step": 
442 + }, + { + "epoch": 1.34, + "learning_rate": 1e-05, + "loss": 0.2365, + "step": 444 + }, + { + "epoch": 1.34, + "learning_rate": 1e-05, + "loss": 0.2206, + "step": 446 + }, + { + "epoch": 1.35, + "learning_rate": 1e-05, + "loss": 0.2329, + "step": 448 + }, + { + "epoch": 1.36, + "learning_rate": 1e-05, + "loss": 0.23, + "step": 450 + }, + { + "epoch": 1.36, + "learning_rate": 1e-05, + "loss": 0.2177, + "step": 452 + }, + { + "epoch": 1.37, + "learning_rate": 1e-05, + "loss": 0.219, + "step": 454 + }, + { + "epoch": 1.37, + "learning_rate": 1e-05, + "loss": 0.226, + "step": 456 + }, + { + "epoch": 1.38, + "learning_rate": 1e-05, + "loss": 0.2267, + "step": 458 + }, + { + "epoch": 1.39, + "learning_rate": 1e-05, + "loss": 0.2249, + "step": 460 + }, + { + "epoch": 1.39, + "learning_rate": 1e-05, + "loss": 0.2368, + "step": 462 + }, + { + "epoch": 1.4, + "learning_rate": 1e-05, + "loss": 0.2146, + "step": 464 + }, + { + "epoch": 1.4, + "learning_rate": 1e-05, + "loss": 0.218, + "step": 466 + }, + { + "epoch": 1.41, + "learning_rate": 1e-05, + "loss": 0.2304, + "step": 468 + }, + { + "epoch": 1.42, + "learning_rate": 1e-05, + "loss": 0.2314, + "step": 470 + }, + { + "epoch": 1.42, + "learning_rate": 1e-05, + "loss": 0.2295, + "step": 472 + }, + { + "epoch": 1.43, + "learning_rate": 1e-05, + "loss": 0.22, + "step": 474 + }, + { + "epoch": 1.43, + "learning_rate": 1e-05, + "loss": 0.2172, + "step": 476 + }, + { + "epoch": 1.44, + "learning_rate": 1e-05, + "loss": 0.2244, + "step": 478 + }, + { + "epoch": 1.45, + "learning_rate": 1e-05, + "loss": 0.2301, + "step": 480 + }, + { + "epoch": 1.45, + "learning_rate": 1e-05, + "loss": 0.2069, + "step": 482 + }, + { + "epoch": 1.46, + "learning_rate": 1e-05, + "loss": 0.2342, + "step": 484 + }, + { + "epoch": 1.46, + "learning_rate": 1e-05, + "loss": 0.2162, + "step": 486 + }, + { + "epoch": 1.47, + "learning_rate": 1e-05, + "loss": 0.2184, + "step": 488 + }, + { + "epoch": 1.48, + "learning_rate": 1e-05, + "loss": 0.2369, + 
"step": 490 + }, + { + "epoch": 1.48, + "learning_rate": 1e-05, + "loss": 0.2053, + "step": 492 + }, + { + "epoch": 1.49, + "learning_rate": 1e-05, + "loss": 0.2148, + "step": 494 + }, + { + "epoch": 1.49, + "learning_rate": 1e-05, + "loss": 0.227, + "step": 496 + }, + { + "epoch": 1.5, + "learning_rate": 1e-05, + "loss": 0.2252, + "step": 498 + }, + { + "epoch": 1.51, + "learning_rate": 1e-05, + "loss": 0.2183, + "step": 500 + }, + { + "epoch": 1.51, + "learning_rate": 1e-05, + "loss": 0.2222, + "step": 502 + }, + { + "epoch": 1.52, + "learning_rate": 1e-05, + "loss": 0.2158, + "step": 504 + }, + { + "epoch": 1.52, + "learning_rate": 1e-05, + "loss": 0.1991, + "step": 506 + }, + { + "epoch": 1.53, + "learning_rate": 1e-05, + "loss": 0.2093, + "step": 508 + }, + { + "epoch": 1.54, + "learning_rate": 1e-05, + "loss": 0.2021, + "step": 510 + }, + { + "epoch": 1.54, + "learning_rate": 1e-05, + "loss": 0.2392, + "step": 512 + }, + { + "epoch": 1.55, + "learning_rate": 1e-05, + "loss": 0.2119, + "step": 514 + }, + { + "epoch": 1.55, + "learning_rate": 1e-05, + "loss": 0.2261, + "step": 516 + }, + { + "epoch": 1.56, + "learning_rate": 1e-05, + "loss": 0.2138, + "step": 518 + }, + { + "epoch": 1.57, + "learning_rate": 1e-05, + "loss": 0.2029, + "step": 520 + }, + { + "epoch": 1.57, + "learning_rate": 1e-05, + "loss": 0.2107, + "step": 522 + }, + { + "epoch": 1.58, + "learning_rate": 1e-05, + "loss": 0.2078, + "step": 524 + }, + { + "epoch": 1.58, + "learning_rate": 1e-05, + "loss": 0.2187, + "step": 526 + }, + { + "epoch": 1.59, + "learning_rate": 1e-05, + "loss": 0.2174, + "step": 528 + }, + { + "epoch": 1.6, + "learning_rate": 1e-05, + "loss": 0.2165, + "step": 530 + }, + { + "epoch": 1.6, + "learning_rate": 1e-05, + "loss": 0.2098, + "step": 532 + }, + { + "epoch": 1.61, + "learning_rate": 1e-05, + "loss": 0.199, + "step": 534 + }, + { + "epoch": 1.61, + "learning_rate": 1e-05, + "loss": 0.2002, + "step": 536 + }, + { + "epoch": 1.62, + "learning_rate": 1e-05, + 
"loss": 0.2137, + "step": 538 + }, + { + "epoch": 1.63, + "learning_rate": 1e-05, + "loss": 0.2176, + "step": 540 + }, + { + "epoch": 1.63, + "learning_rate": 1e-05, + "loss": 0.2153, + "step": 542 + }, + { + "epoch": 1.64, + "learning_rate": 1e-05, + "loss": 0.2164, + "step": 544 + }, + { + "epoch": 1.64, + "learning_rate": 1e-05, + "loss": 0.2305, + "step": 546 + }, + { + "epoch": 1.65, + "learning_rate": 1e-05, + "loss": 0.2182, + "step": 548 + }, + { + "epoch": 1.66, + "learning_rate": 1e-05, + "loss": 0.2228, + "step": 550 + }, + { + "epoch": 1.66, + "learning_rate": 1e-05, + "loss": 0.2207, + "step": 552 + }, + { + "epoch": 1.67, + "learning_rate": 1e-05, + "loss": 0.224, + "step": 554 + }, + { + "epoch": 1.67, + "learning_rate": 1e-05, + "loss": 0.2201, + "step": 556 + }, + { + "epoch": 1.68, + "learning_rate": 1e-05, + "loss": 0.2137, + "step": 558 + }, + { + "epoch": 1.69, + "learning_rate": 1e-05, + "loss": 0.2097, + "step": 560 + }, + { + "epoch": 1.69, + "learning_rate": 1e-05, + "loss": 0.2155, + "step": 562 + }, + { + "epoch": 1.7, + "learning_rate": 1e-05, + "loss": 0.2166, + "step": 564 + }, + { + "epoch": 1.7, + "learning_rate": 1e-05, + "loss": 0.2302, + "step": 566 + }, + { + "epoch": 1.71, + "learning_rate": 1e-05, + "loss": 0.2078, + "step": 568 + }, + { + "epoch": 1.72, + "learning_rate": 1e-05, + "loss": 0.1972, + "step": 570 + }, + { + "epoch": 1.72, + "learning_rate": 1e-05, + "loss": 0.2198, + "step": 572 + }, + { + "epoch": 1.73, + "learning_rate": 1e-05, + "loss": 0.213, + "step": 574 + }, + { + "epoch": 1.73, + "learning_rate": 1e-05, + "loss": 0.2183, + "step": 576 + }, + { + "epoch": 1.74, + "learning_rate": 1e-05, + "loss": 0.2044, + "step": 578 + }, + { + "epoch": 1.75, + "learning_rate": 1e-05, + "loss": 0.2163, + "step": 580 + }, + { + "epoch": 1.75, + "learning_rate": 1e-05, + "loss": 0.2031, + "step": 582 + }, + { + "epoch": 1.76, + "learning_rate": 1e-05, + "loss": 0.2228, + "step": 584 + }, + { + "epoch": 1.77, + 
"learning_rate": 1e-05, + "loss": 0.2271, + "step": 586 + }, + { + "epoch": 1.77, + "learning_rate": 1e-05, + "loss": 0.2196, + "step": 588 + }, + { + "epoch": 1.78, + "learning_rate": 1e-05, + "loss": 0.2134, + "step": 590 + }, + { + "epoch": 1.78, + "learning_rate": 1e-05, + "loss": 0.2019, + "step": 592 + }, + { + "epoch": 1.79, + "learning_rate": 1e-05, + "loss": 0.1986, + "step": 594 + }, + { + "epoch": 1.8, + "learning_rate": 1e-05, + "loss": 0.1943, + "step": 596 + }, + { + "epoch": 1.8, + "learning_rate": 1e-05, + "loss": 0.206, + "step": 598 + }, + { + "epoch": 1.81, + "learning_rate": 1e-05, + "loss": 0.2154, + "step": 600 + }, + { + "epoch": 1.81, + "learning_rate": 1e-05, + "loss": 0.2103, + "step": 602 + }, + { + "epoch": 1.82, + "learning_rate": 1e-05, + "loss": 0.2167, + "step": 604 + }, + { + "epoch": 1.83, + "learning_rate": 1e-05, + "loss": 0.209, + "step": 606 + }, + { + "epoch": 1.83, + "learning_rate": 1e-05, + "loss": 0.2208, + "step": 608 + }, + { + "epoch": 1.84, + "learning_rate": 1e-05, + "loss": 0.2252, + "step": 610 + }, + { + "epoch": 1.84, + "learning_rate": 1e-05, + "loss": 0.2092, + "step": 612 + }, + { + "epoch": 1.85, + "learning_rate": 1e-05, + "loss": 0.2099, + "step": 614 + }, + { + "epoch": 1.86, + "learning_rate": 1e-05, + "loss": 0.1955, + "step": 616 + }, + { + "epoch": 1.86, + "learning_rate": 1e-05, + "loss": 0.2176, + "step": 618 + }, + { + "epoch": 1.87, + "learning_rate": 1e-05, + "loss": 0.2053, + "step": 620 + }, + { + "epoch": 1.87, + "learning_rate": 1e-05, + "loss": 0.2119, + "step": 622 + }, + { + "epoch": 1.88, + "learning_rate": 1e-05, + "loss": 0.1986, + "step": 624 + }, + { + "epoch": 1.89, + "learning_rate": 1e-05, + "loss": 0.2136, + "step": 626 + }, + { + "epoch": 1.89, + "learning_rate": 1e-05, + "loss": 0.2154, + "step": 628 + }, + { + "epoch": 1.9, + "learning_rate": 1e-05, + "loss": 0.2091, + "step": 630 + }, + { + "epoch": 1.9, + "learning_rate": 1e-05, + "loss": 0.2161, + "step": 632 + }, + { + 
"epoch": 1.91, + "learning_rate": 1e-05, + "loss": 0.2107, + "step": 634 + }, + { + "epoch": 1.92, + "learning_rate": 1e-05, + "loss": 0.1814, + "step": 636 + }, + { + "epoch": 1.92, + "learning_rate": 1e-05, + "loss": 0.2097, + "step": 638 + }, + { + "epoch": 1.93, + "learning_rate": 1e-05, + "loss": 0.2078, + "step": 640 + }, + { + "epoch": 1.93, + "learning_rate": 1e-05, + "loss": 0.1985, + "step": 642 + }, + { + "epoch": 1.94, + "learning_rate": 1e-05, + "loss": 0.2115, + "step": 644 + }, + { + "epoch": 1.95, + "learning_rate": 1e-05, + "loss": 0.2128, + "step": 646 + }, + { + "epoch": 1.95, + "learning_rate": 1e-05, + "loss": 0.2133, + "step": 648 + }, + { + "epoch": 1.96, + "learning_rate": 1e-05, + "loss": 0.2109, + "step": 650 + }, + { + "epoch": 1.96, + "learning_rate": 1e-05, + "loss": 0.2133, + "step": 652 + }, + { + "epoch": 1.97, + "learning_rate": 1e-05, + "loss": 0.198, + "step": 654 + }, + { + "epoch": 1.98, + "learning_rate": 1e-05, + "loss": 0.2068, + "step": 656 + }, + { + "epoch": 1.98, + "learning_rate": 1e-05, + "loss": 0.2222, + "step": 658 + }, + { + "epoch": 1.99, + "learning_rate": 1e-05, + "loss": 0.2035, + "step": 660 + }, + { + "epoch": 1.99, + "learning_rate": 1e-05, + "loss": 0.2067, + "step": 662 + }, + { + "epoch": 2.0, + "learning_rate": 1e-05, + "loss": 0.207, + "step": 664 + } + ], + "logging_steps": 2, + "max_steps": 1992, + "num_train_epochs": 6, + "save_steps": 64.0, + "total_flos": 148901009686528.0, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c071db1 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6da0167127a217efe26c78d229b99ad905ad18eecd77581e01cd9a087834755f +size 6456