初始化项目,由ModelHub XC社区提供模型

Model: Polygl0t/Tucano2-qwen-3.7B-Base
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-25 19:22:52 +08:00
commit 32fbd35202
36 changed files with 465872 additions and 0 deletions

52
.gitattributes vendored Normal file
View File

@@ -0,0 +1,52 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
logo.png filter=lfs diff=lfs merge=lfs -text
.plots/arc_challenge.png filter=lfs diff=lfs merge=lfs -text
.plots/before_and_after.png filter=lfs diff=lfs merge=lfs -text
.plots/belebele.png filter=lfs diff=lfs merge=lfs -text
.plots/bluex.png filter=lfs diff=lfs merge=lfs -text
.plots/calame.png filter=lfs diff=lfs merge=lfs -text
.plots/enem.png filter=lfs diff=lfs merge=lfs -text
.plots/global_piqa.png filter=lfs diff=lfs merge=lfs -text
.plots/gradient_norm.png filter=lfs diff=lfs merge=lfs -text
.plots/hellaswag.png filter=lfs diff=lfs merge=lfs -text
.plots/lambada.png filter=lfs diff=lfs merge=lfs -text
.plots/learning_curve.png filter=lfs diff=lfs merge=lfs -text
.plots/mmlu.png filter=lfs diff=lfs merge=lfs -text
.plots/npm_easy.png filter=lfs diff=lfs merge=lfs -text
.plots/npm_hard.png filter=lfs diff=lfs merge=lfs -text
.plots/oab.png filter=lfs diff=lfs merge=lfs -text
.plots/performance_vs_compute.png filter=lfs diff=lfs merge=lfs -text

3
.plots/arc_challenge.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8af866351e9918524decc2b6c89b2d42bc1a512a6c9b01939b1135d4624b8ea
size 195444

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9329d9b9e44be3dc8e30d5b1d63e045e82416dfe0497e71c50889411f47415c3
size 278984

3
.plots/belebele.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:13a715235191f3c7e63e65e35a317337a23ceebf01d0ca1d0a988b6e72f0d688
size 194551

3
.plots/bluex.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:359efe6b2879ac4bde29d1924dc9a7fa554d5a07ec13821b145718d55a2fbf9d
size 189109

3
.plots/calame.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eee7333dd48cda9a8754942803f5a35a4555d7d8c6d1866a39a0cfb81c7a11fa
size 174470

3
.plots/enem.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4dd7c49b0fee952386f5be0aeae54cfd91eddd53f29466981ab8bbe2ef73f4b7
size 163967

3
.plots/global_piqa.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:177b7cca833de2440787821ffd7aa92525d5955dc28c07fc9b7e803afacea56e
size 166269

3
.plots/gradient_norm.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aab0537cf6a04037928d0ebaa88ab2e84a23c0d869cb0f32bd0c7fd9014f7f9a
size 273517

3
.plots/hellaswag.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b6a6d0ecb3ef72c2afd1c828e075f16b0cfb365f335007f4bfc8323c9117b52
size 168704

3
.plots/lambada.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8079bd87390d73ee8c40ad849ab1f89e35f95bc29af78fcabd025a5b10ff0eac
size 180169

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0cf6807d2ef3dbbb0b332ff6f08e2cfef52cfbe9936b43bfa0a973e751b94068
size 221929

3
.plots/mmlu.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef96236e6f0c577dea8fc3b3dfa2739061d7fc6ce1e9a02e51453d8ae62543c4
size 181825

3
.plots/npm_easy.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b83aac4bb477232ed43a9a0af497967d3e3e041ee515b72d5fdc310833881039
size 188554

3
.plots/npm_hard.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f416e9e8d05808313600b206ab53c612c90b2be8325f87b28c5fb8b812066699
size 190920

3
.plots/oab.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e3e05da44c8ec4e744e6d51b00cd32ff48dfaf94a5036d5498c4fe8a8e9417e
size 191665

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4a657d6d68d6f2f1fbf0d3750eac546a7ead054995a466497e0e31bfeedd49c
size 949327

190
LICENSE Normal file
View File

@@ -0,0 +1,190 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright Nicholas Kluge Corrêa, Shiza Fatimah, Aniket Sen, and Sophia Falk
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

495
README.md Normal file
View File

@@ -0,0 +1,495 @@
---
language:
- pt
license: apache-2.0
library_name: transformers
tags:
- text-generation-inference
datasets:
- Polygl0t/gigaverbo-v2
- Polygl0t/gigaverbo-v2-synth
metrics:
- perplexity
pipeline_tag: text-generation
widget:
- text: "A floresta da Amazônia é conhecida por sua"
example_title: Exemplo
- text: "Uma das coisas que Portugal, Angola, Brasil e Moçambique têm em comum é o"
example_title: Exemplo
- text: "O Carnaval do Rio de Janeiro é"
example_title: Exemplo
inference:
parameters:
repetition_penalty: 1.2
temperature: 0.1
top_k: 50
top_p: 1.0
max_new_tokens: 150
co2_eq_emissions:
emissions: 466000
source: CodeCarbon
training_type: pre-training
geographical_location: Germany
hardware_used: NVIDIA A100-SXM4-80GB
model-index:
- name: Tucano2-qwen-3.7B-Base
results:
- task:
type: text-generation
name: Text Generation
dataset:
name: ARC Challenge
type: Polygl0t/ARC-poly
split: test
args:
num_few_shot: 5
metrics:
- type: acc_norm
value: 57.78
name: Acc-norm
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: arc_challenge_poly_pt
- task:
type: text-generation
name: Text Generation
dataset:
name: HellaSwag
type: Polygl0t/HellaSwag-poly
split: validation
args:
num_few_shot: 5
metrics:
- type: acc_norm
value: 65.32
name: Acc-norm
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: hellaswag_poly_pt
- task:
type: text-generation
name: Text Generation
dataset:
name: Calame
type: Polygl0t/CALAME-PT
split: test
args:
num_few_shot: 5
metrics:
- type: acc
value: 61.08
name: Acc
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: calame_pt
- task:
type: text-generation
name: Text Generation
dataset:
name: Lambada
type: Polygl0t/LAMBADA-poly
split: test
args:
num_few_shot: 5
metrics:
- type: acc
value: 62.53
name: Acc
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: lambada_pt
- task:
type: text-generation
name: Text Generation
dataset:
name: Global PIQA
type: mrlbenchmarks/global-piqa-nonparallel
split: test
args:
num_few_shot: 5
metrics:
- type: acc_norm
value: 83
name: Acc-norm
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: global_piqa_completions_por_latn_braz
- task:
type: text-generation
name: Text Generation
dataset:
name: MMLU
type: Polygl0t/MMLU-poly
split: test
args:
num_few_shot: 5
metrics:
- type: acc
value: 65.4
name: Acc
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: mmlu_poly_pt
- task:
type: text-generation
name: Text Generation
dataset:
name: BELEBELE
type: facebook/belebele
split: test
args:
num_few_shot: 5
metrics:
- type: acc_norm
value: 83.67
name: Acc-norm
source:
url: https://github.com/Polygl0t/lm-evaluation-harness/tree/polyglot_harness_portuguese
name: belebele_por_Latn
- task:
type: text-generation
name: Text Generation
dataset:
name: BLUEX
type: eduagarcia-temp/BLUEX_without_images
split: train
args:
num_few_shot: 3
metrics:
- type: acc
value: 66.2
name: Acc
source:
url: https://github.com/eduagarcia/lm-evaluation-harness-pt
name: bluex
- task:
type: text-generation
name: Text Generation
dataset:
name: ENEM Challenge
type: eduagarcia/enem_challenge
split: train
args:
num_few_shot: 3
metrics:
- type: acc
value: 77.54
name: Acc
source:
url: https://github.com/eduagarcia/lm-evaluation-harness-pt
name: enem_challenge
- task:
type: text-generation
name: Text Generation
dataset:
name: OAB Exams
type: eduagarcia/oab_exams
split: train
args:
num_few_shot: 3
metrics:
- type: acc
value: 58.45
name: Acc
source:
url: https://github.com/eduagarcia/lm-evaluation-harness-pt
name: oab_exams
base_model: Qwen/Qwen3-4B-Base
---
# Tucano2-qwen-3.7B-Base
<img src="./logo.png" alt="An illustration of a Tucano bird showing vibrant colors like yellow, orange, blue, green, and black." height="200">
## Model Summary
**[Tucano2-qwen-3.7B-Base](https://huggingface.co/Polygl0t/Tucano2-qwen-3.7B-Base)** is a decoder-only transformer continually pretrained from [Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base). Tucano2 is part of the [Polygl0t](https://huggingface.co/Polygl0t) initiative, which aims to advance language models for low-resource languages.
Tucano2-qwen-3.7B-Base shares the same tokenizer as **[Tucano2-0.6B-Base](https://huggingface.co/Polygl0t/Tucano2-0.6B-Base)**. Token embedding transplantation via _Orthogonal Matching Pursuit_ was used to adapt Qwen3-4B-Base to be more sensitive to the lexical, morphological, and orthographic properties of Portuguese.
The model was continually pretrained on approximately 50 billion tokens and achieves state-of-the-art performance across several benchmarks designed to evaluate Portuguese language models. **All data, source code, and recipes used to develop the Tucano2 series are open and fully reproducible.**
## Details
- **Architecture:** a Transformer-based model ([`qwen3`](https://huggingface.co/docs/transformers/main/en/model_doc/qwen3))
- **Size:** 3,759,341,056 parameters
- **Context length:** 4,096 tokens
- **Dataset(s):**
- [Polygl0t/gigaverbo-v2](https://huggingface.co/datasets/Polygl0t/gigaverbo-v2)
- [Polygl0t/gigaverbo-v2-synth](https://huggingface.co/datasets/Polygl0t/gigaverbo-v2-synth)
- **Language(s):** Portuguese
- **Batch size:** 1,048,576 tokens
- **Number of steps:** 50,000
- **GPU:** 8 NVIDIA A100-SXM4-80GB
- **Training time:** ~303 hours
- **Emissions:** 466 KgCO2 (Germany)
- **Total energy consumption:** 1223 kWh
The [source code](https://github.com/Polygl0t/llm-foundry) used to train this model is available on GitHub. The full configuration used for training is available in the following config file:
- Single stage (linear warmup with cosine decay): [training_config.yaml](training_config.yaml)
### Checkpoints
Checkpoints were saved every 2,500 steps, which equates to approximately 2.5 billion tokens. The main branch of this repository contains the final checkpoint saved at step 50000. All other checkpoints are available as separate branches. To load a specific checkpoint, you can use the following code snippet:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "Polygl0t/Tucano2-qwen-3.7B-Base"
revision = "step-2500" # Change this to the desired checkpoint branch
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
```
Alternatively, you can list all available revisions of the model with the following code snippet:
```python
from huggingface_hub import list_repo_refs
out = list_repo_refs("Polygl0t/Tucano2-qwen-3.7B-Base")
branches = [b.name for b in out.branches]
print(branches)
```
<details>
<summary><b>Learning Curves</b></summary>
![Learning Curves](./.plots/learning_curve.png)
This plot illustrates the evolution of model performance (measured by loss) as a function of training time, measured in tokens seen during training.
</details>
<details>
<summary><b>Gradient Norms (L2)</b></summary>
![Gradient Norms](./.plots/gradient_norm.png)
This plot illustrates the evolution of gradient norms as a function of training time, measured in tokens seen during training.
</details>
## Intended Uses
The primary intended use of Tucano2-qwen-3.7B-Base is to serve as a foundation for research and development involving Portuguese language modeling. Checkpoints saved during training are designed to provide a controlled setting for performing comparative experiments, specifically regarding the effects of continual pretraining on performance across currently available benchmarks. You may also fine-tune and adapt Tucano2-qwen-3.7B-Base for deployment if your use follows the Apache 2.0 license. If you decide to use Tucano2-qwen-3.7B-Base as a basis for your fine-tuned model, please conduct your own risk and bias assessment.
## Out-of-scope Use
- Tucano2-qwen-3.7B-Base is **not intended for deployment**. It is not an out-of-the-box product and should not be used for human-facing interactions.
- Tucano2-qwen-3.7B-Base is for **the Portuguese language only** and is unsuitable for text generation tasks in other languages.
- Tucano2-qwen-3.7B-Base has **not been fine-tuned** for downstream tasks.
## Basic usage
```python
from transformers import GenerationConfig, TextGenerationPipeline, AutoTokenizer, AutoModelForCausalLM
import torch
# Specify the model and tokenizer
model_id = "Polygl0t/Tucano2-qwen-3.7B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
# Specify the generation parameters as you like
generation_config = GenerationConfig(
**{
"do_sample": True,
"max_new_tokens": 150,
"renormalize_logits": True,
"repetition_penalty": 1.2,
"temperature": 0.1,
"top_k": 50,
"top_p": 1.0,
"use_cache": True,
}
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = TextGenerationPipeline(model=model, task="text-generation", tokenizer=tokenizer, device=device)
# Generate text
prompt = "# A floresta da Amazônia: um lugar de Magia\n\n"
completion = generator(prompt, generation_config=generation_config)
print(completion[0]['generated_text'])
```
## Limitations
Like almost all other language models trained on large text datasets scraped from the web, Tucano2-qwen-3.7B-Base exhibits behavior that makes it unsuitable as an out-of-the-box solution for many real-world applications, especially those requiring factual, reliable, and nontoxic text generation. Tucano2-qwen-3.7B-Base is subject to the following:
- **Hallucinations:** Tucano2-qwen-3.7B-Base can produce content that can be mistaken as facts, but is misleading or entirely false, i.e., hallucinations.
- **Biases and Toxicity:** Tucano2-qwen-3.7B-Base inherits the social and historical stereotypes from the data used to train it. Given these biases, the model can produce toxic content, i.e., harmful, offensive, or detrimental to individuals, groups, or communities.
- **Language Limitations:** Tucano2-qwen-3.7B-Base is primarily designed to interact with Portuguese. Other languages might challenge its comprehension, leading to potential misinterpretations or errors in response.
- **Repetition and Verbosity:** Tucano2-qwen-3.7B-Base may get stuck in repetition loops (especially if the repetition penalty during generation is set too low) or produce verbose responses unrelated to the prompt it was given.
Hence, even though Tucano2-qwen-3.7B-Base is released under a permissive license, we urge users to perform their own risk analysis before using it for real-world applications.
## Evaluations
The table below compares the Tucano2 series against other base models of similar size. We divide our evaluations into two sets:
- **Easy Set**: CALAME, GlobalPIQA, LAMBADA, ARC-Challenge, HellaSwag
- **Hard Set**: ENEM, BLUEX, OAB Exams, BELEBELE, MMLU
The NPM (Normalized Performance Metric) provides a balanced view of model performance across tasks, accounting for each task's inherent difficulty by normalizing its evaluation score relative to its random baseline.
| | Total Avg. | Easy Set (NPM) | Hard Set (NPM) |
| -------------------------- | ---------- | -------------- | -------------- |
| **Tucano2-qwen-3.7B-Base** | 59.21 | 57.41 | 61 |
| Qwen2.5-7B | 57.97 | 54.12 | 61.83 |
| Qwen3-4B-Base | 57.86 | 52.52 | 63.2 |
| SmolLM3-3B-Base | 50.25 | 54.06 | 46.44 |
| Qwen2.5-3B | 50.16 | 47.69 | 52.62 |
| **Tucano2-qwen-1.5B-Base** | 47.9 | 47.97 | 47.82 |
| Curio-edu-7b | 45.66 | 57.46 | 33.87 |
| Qwen3-1.7B-Base | 44.48 | 40.94 | 48.03 |
| Curio-7b | 42.79 | 58.97 | 26.6 |
| Llama-3.2-3B | 40.5 | 43.79 | 37.21 |
| granite-3.3-2b-base | 39.97 | 45.31 | 34.63 |
| **Tucano2-qwen-0.5B-Base** | 35.36 | 39.93 | 30.79 |
| Qwen3-0.6B-Base | 29.4 | 26.41 | 32.38 |
| Llama-2-7b-hf | 29.36 | 42.69 | 16.03 |
| **Tucano2-0.6B-Base** | 20.64 | 40.28 | 0.99 |
| Qwen2.5-0.5B | 19.89 | 18.7 | 21.09 |
| Curio-1.1b | 19.23 | 39.16 | -0.69 |
| Tucano-2b4 | 17.88 | 33.55 | 2.2 |
| Curio-edu-1b1 | 17.72 | 34.77 | 0.67 |
| Llama-3.2-1B | 16.57 | 28.32 | 4.83 |
| Tucano-1b1 | 15.44 | 29.12 | 1.76 |
| Tucano-630m | 14.9 | 26.99 | 2.8 |
| Carvalho_pt-gl-1.3B | 12.54 | 26.75 | -1.66 |
| TeenyTinyLlama-460m | 11.18 | 19.65 | 2.72 |
| Tucano-160m | 8.78 | 19.12 | -1.56 |
| TeenyTinyLlama-160m | 7.72 | 15.75 | -0.31 |
| GlorIA-1.3B | 5.93 | 27.27 | -15.42 |
<details>
<summary><b>Evaluation Suite</b></summary>
| **Benchmark** | **n-shot** | **Type** | **Baseline** | **Metric** |
| --------------- | ---------- | ------------------ | ------------ | ---------- |
| **Easy Set** | | | | |
| CALAME | 5-shot | Completion | 0 | `acc` |
| GlobalPIQA | 5-shot | Completion | 50 | `acc_norm` |
| LAMBADA | 5-shot | Completion | 0 | `acc` |
| ARC-Challenge | 5-shot | MC-Q&A | 25 | `acc_norm` |
| HellaSwag | 5-shot | Completion | 25 | `acc_norm` |
| **Hard Set** | | | | |
| ENEM           | 3-shot     | MC-Q&A             | 20           | `acc` |
| BLUEX           | 3-shot     | MC-Q&A             | 22.5         | `acc` |
| OAB Exams       | 3-shot     | MC-Q&A             | 25           | `acc` |
| BELEBELE | 5-shot | MC-Q&A | 25 | `acc_norm` |
| MMLU | 5-shot | MC-Q&A | 25 | `acc` |
</details>
<details>
<summary><b>Individual Benchmarks</b></summary>
| | BLUEX | ENEM | OAB | ARC Challenge | BELEBELE | CALAME | Global PIQA | HellaSwag | LAMBADA | MMLU |
| -------------------------- | ----- | ----- | ----- | ------------- | -------- | ------ | ----------- | --------- | ------- | ----- |
| **Tucano2-qwen-3.7B-Base** | 66.2 | 77.54 | 58.45 | 57.78 | 83.67 | 61.08 | 83 | 65.32 | 62.53 | 65.4 |
| Qwen2.5-7B | 65.92 | 75.02 | 55.03 | 54.19 | 89.67 | 58.96 | 78 | 67.92 | 59.52 | 68.55 |
| Qwen3-4B-Base | 69.96 | 77.61 | 55.58 | 54.53 | 87.89 | 57.95 | 77 | 63.19 | 60.37 | 68.59 |
| SmolLM3-3B-Base | 54.52 | 61.37 | 45.51 | 51.37 | 77.67 | 59.15 | 81 | 65.57 | 59.89 | 56.19 |
| Qwen2.5-3B | 58.28 | 67.32 | 50.34 | 45.21 | 83.22 | 58.38 | 75 | 59.44 | 57.17 | 59.79 |
| **Tucano2-qwen-1.5B-Base** | 55.91 | 68.72 | 48.29 | 48.21 | 74 | 59.06 | 77 | 56.25 | 54.2 | 54.04 |
| Curio-edu-7b | 47.15 | 58.64 | 43.78 | 50.94 | 53 | 60.79 | 86 | 66.48 | 64.62 | 45.14 |
| Qwen3-1.7B-Base | 57.16 | 65.22 | 45.79 | 47.18 | 77.89 | 53.56 | 67 | 52.55 | 50.81 | 55.49 |
| Curio-7b | 43.39 | 50.59 | 39.68 | 48.03 | 45.33 | 63.44 | 89 | 67.58 | 65.94 | 40.83 |
| Llama-3.2-3B | 50.35 | 53.04 | 39.45 | 41.11 | 68.89 | 54.48 | 69 | 59.14 | 59.48 | 48.28 |
| granite-3.3-2b-base | 45.34 | 54.02 | 39.54 | 41.37 | 65.67 | 58.77 | 70 | 60.81 | 58.22 | 45.63 |
| **Tucano2-qwen-0.5B-Base** | 46.87 | 55.14 | 40.36 | 37.44 | 53.89 | 58.67 | 74 | 48.43 | 45.14 | 39.68 |
| Qwen3-0.6B-Base | 42.98 | 49.48 | 40.46 | 36.92 | 65 | 45.95 | 54 | 40.33 | 41.78 | 43.54 |
| Llama-2-7b-hf | 31.29 | 31.77 | 35.49 | 42.14 | 41.44 | 54.53 | 67 | 56.76 | 59.73 | 38.64 |
| **Tucano2-0.6B-Base** | 21.14 | 23.58 | 23.28 | 37.01 | 26.22 | 57.61 | 79 | 47.74 | 39.45 | 27.18 |
| Qwen2.5-0.5B | 32.55 | 38.91 | 35.9 | 28.46 | 49.56 | 44.89 | 44 | 37.7 | 39.08 | 41.17 |
| Curio-1.1b | 21.56 | 21.06 | 23.1 | 30.43 | 22.89 | 59.25 | 75 | 49.45 | 46.69 | 26.35 |
| Tucano-2b4 | 25.45 | 21.62 | 26.74 | 30.43 | 25.89 | 50.34 | 73 | 48.85 | 32.39 | 26.24 |
| Curio-edu-1b1 | 23.5 | 19.87 | 25.01 | 32.22 | 26.22 | 54.91 | 69 | 46.3 | 42.93 | 25.43 |
| Llama-3.2-1B | 24.06 | 23.93 | 26.06 | 31.71 | 33.33 | 50 | 55 | 45.27 | 45.6 | 28.51 |
| Tucano-1b1 | 25.45 | 21.55 | 26.38 | 30.09 | 25.67 | 48.94 | 68 | 44.1 | 28.43 | 25.26 |
| Tucano-630m | 26.7 | 21.69 | 26.92 | 28.72 | 27.33 | 47.3 | 68 | 40.37 | 26.2 | 25.6 |
| Carvalho_pt-gl-1.3B | 19.33 | 18.12 | 22.32 | 27.01 | 26.44 | 53.42 | 63 | 38.53 | 33.59 | 24.82 |
| TeenyTinyLlama-460m | 25.87 | 20.15 | 27.02 | 27.35 | 28.11 | 42.49 | 59 | 34.81 | 21.56 | 26.65 |
| Tucano-160m | 24.76 | 20.57 | 17.22 | 25.56 | 23.44 | 43.59 | 59 | 33.73 | 21.64 | 25.77 |
| TeenyTinyLlama-160m | 22.53 | 18.89 | 22.32 | 24.02 | 26.78 | 39.79 | 58 | 29.89 | 17.74 | 25.74 |
| GlorIA-1.3B | 4.31 | 2.52 | 4.69 | 26.41 | 22.78 | 54.67 | 64 | 36.35 | 36.68 | 23.69 |
</details>
## Performance and Compute
Below, we display the performance of Tucano2-qwen-3.7B-Base across all benchmarks in our evaluation suite. Tucano2-qwen-3.7B-Base is compared with Qwen3-4B-Base, the base model from which it was continually pretrained. The percentage variation in performance is displayed in terms of the difference in evaluation scores between the Base and the Continually Pretrained model.
All individual benchmark scores and their evolution across training time can be found in the [.plots](https://huggingface.co/Polygl0t/Tucano2-qwen-3.7B-Base/tree/main/.plots/) folder.
**Before and After Continual Pretraining**
![Performance Before and After Continual Pretraining](./.plots/before_and_after.png)
This plot compares the compute requirements (measured as C = 6 \* N \* D, where N is the number of parameters and D is the number of tokens processed) against the performance of each model (measured by the NPM score).
![NPM vs Compute](./.plots/performance_vs_compute.png)
<details>
<summary><b>Performance and Compute Details</b></summary>
| | Parameters (B) | Pretraining Tokens (B) | Continual Pretraining Tokens (B) | Total Tokens (B) | Pretraining Compute (FLOPs) | Continual Pretraining Compute (FLOPs) | Total Compute (FLOPs) | NPM Score |
|----------------------------|----------------|------------------------|----------------------------------|------------------|-----------------------------|---------------------------------------|-----------------------|-----------|
| **Tucano2-qwen-3.7B-Base** | 3.7 | 36000 | 50 | 36050 | 8.64e+23 | 1.11e+21 | 8.65e+23 | 59.2 |
| Qwen2.5-7B | 7 | 18000 | - | 18000 | 7.56e+23 | - | 7.56e+23 | 57.97 |
| Qwen3-4B-Base | 4 | 36000 | - | 36000 | 8.64e+23 | - | 8.64e+23 | 57.86 |
| SmolLM3-3B-Base | 3 | 11200 | - | 11200 | 2.02e+23 | - | 2.02e+23 | 50.25 |
| Qwen2.5-3B | 3 | 18000 | - | 18000 | 3.24e+23 | - | 3.24e+23 | 50.15 |
| **Tucano2-qwen-1.5B-Base** | 1.5 | 36000 | 100 | 36100 | 3.67e+23 | 9e+20 | 3.68e+23 | 47.89 |
| Curio-edu-7b | 7 | 2000 | 20 | 2020 | 8.4e+22 | 8.4e+20 | 8.48e+22 | 45.66 |
| Qwen3-1.7B-Base | 1.7 | 36000 | - | 36000 | 3.67e+23 | - | 3.67e+23 | 44.48 |
| Curio-7b | 7 | 2000 | 150 | 2150 | 8.4e+22 | 6.3e+21 | 9.03e+22 | 42.78 |
| Llama-3.2-3B | 3 | 9000 | - | 9000 | 1.62e+23 | - | 1.62e+23 | 40.5 |
| granite-3.3-2b-base | 2 | 12000 | - | 12000 | 1.44e+23 | - | 1.44e+23 | 39.96 |
| **Tucano2-qwen-0.5B-Base** | 0.5 | 36000 | 50 | 36050 | 1.3e+23 | 1.5e+20 | 1.3e+23 | 35.35 |
| Qwen3-0.6B-Base | 0.6 | 36000 | - | 36000 | 1.3e+23 | - | 1.3e+23 | 29.39 |
| Llama-2-7b-hf | 7 | 2000 | - | 2000 | 8.4e+22 | - | 8.4e+22 | 29.36 |
| **Tucano2-0.6B-Base** | 0.6 | 408 | - | 408 | 1.47e+21 | - | 1.47e+21 | 20.63 |
| Qwen2.5-0.5B | 0.5 | 18000 | - | 18000 | 5.4e+22 | - | 5.4e+22 | 19.89 |
| Curio-1.1b | 1.1 | 1000 | 150 | 1150 | 6.6e+21 | 9.9e+20 | 7.59e+21 | 19.23 |
| Tucano-2b4 | 2.4 | 515 | - | 515 | 7.42e+21 | - | 7.42e+21 | 17.87 |
| Curio-edu-1b1 | 1.1 | 1000 | 20 | 1020 | 6.6e+21 | 1.32e+20 | 6.73e+21 | 17.72 |
| Llama-3.2-1B | 1 | 9000 | - | 9000 | 5.4e+22 | - | 5.4e+22 | 16.57 |
| Tucano-1b1 | 1.1 | 250 | - | 250 | 1.65e+21 | - | 1.65e+21 | 15.44 |
| Tucano-630m | 0.63 | 211 | - | 211 | 7.98e+20 | - | 7.98e+20 | 14.89 |
| Carvalho_pt-gl-1.3B | 1.3 | 26 | 5 | 31 | 2.03e+20 | 3.9e+19 | 2.42e+20 | 12.54 |
| TeenyTinyLlama-460m | 0.46 | 6.2 | - | 6.2 | 1.71e+19 | - | 1.71e+19 | 11.18 |
| Tucano-160m | 0.16 | 169 | - | 169 | 1.62e+20 | - | 1.62e+20 | 8.78 |
| TeenyTinyLlama-160m | 0.16 | 6.2 | - | 6.2 | 5.95e+18 | - | 5.95e+18 | 7.71 |
| GlorIA-1.3B | 1.3 | 35 | - | 35 | 2.73e+20 | - | 2.73e+20 | 5.92 |
</details>
## Cite as 🤗
```latex
@misc{correa2026tucano2cool,
title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
author={Nicholas Kluge Corr{\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
year={2026},
eprint={2603.03543},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2603.03543},
}
```
## Acknowledgments
Polyglot is a project funded by the Federal Ministry of Education and Research (BMBF) and the Ministry of Culture and Science of the State of North Rhine-Westphalia (MWK) as part of TRA Sustainable Futures (University of Bonn) and the Excellence Strategy of the federal and state governments.
We also gratefully acknowledge the access granted to the [Marvin cluster](https://www.hpc.uni-bonn.de/en/systems/marvin), hosted by the [University of Bonn](https://www.uni-bonn.de/en), along with the support provided by its High Performance Computing & Analytics Lab.
## License
Tucano2-qwen-3.7B-Base is licensed under the Apache License, Version 2.0. For more details, see the [LICENSE](LICENSE) file.

70
config.json Normal file
View File

@@ -0,0 +1,70 @@
{
"architectures": [
"Qwen3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 1,
"dtype": "bfloat16",
"eos_token_id": 2,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 2560,
"initializer_range": 0.02,
"intermediate_size": 9728,
"layer_types": [
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention"
],
"max_position_embeddings": 4096,
"max_window_layers": 36,
"model_type": "qwen3",
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"pad_token_id": 49109,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.53.2",
"use_cache": false,
"use_sliding_window": false,
"vocab_size": 49152
}

23
emissions.csv Normal file
View File

@@ -0,0 +1,23 @@
timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,pue
2026-01-15T18:36:21,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,2468.154875099659,0.5195039106304847,0.0002104827034444,45.027290925,365.33355990502815,70.0,0.0298295135649475,1.287542577533486,0.0463341974708924,1.363706288569326,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-16T09:06:58,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54704.4767507948,11.595711822278624,0.0002119700710254,45.02626823769232,364.31659250401066,70.0,0.6612322660173141,28.75075110169371,1.0269504091064288,30.438933776817496,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-17T00:14:10,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,109136.3574270364,23.13841512055636,0.0002120138115845,45.033781860000005,366.0656712431605,70.0,1.319176990428561,57.37075434434462,2.048787973683132,60.7387193084561,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-17T15:21:21,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,163567.73299597765,34.67404203491055,0.000211985832412,45.02352758400001,365.6402060708027,70.0,1.9771126242558168,85.97220533993158,3.0706112520106603,91.01992921619778,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-18T06:28:30,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,217997.1265374664,46.20552631578055,0.0002119547493569,45.02980962,368.02914726819273,70.0,2.635007360849327,114.56288697912528,4.092370303104284,121.29026464307798,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-18T21:35:36,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,272423.1234841924,57.7453517586373,0.0002119693476093,45.023594388750006,366.0163927822049,70.0,3.292879428269992,143.1755286803314,5.114087648420479,151.58249575702138,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-19T12:42:46,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,326852.9779706467,69.2843478887053,0.0002119740450855,45.019291527,365.1986956207545,70.0,3.9508097393555017,171.78584064300725,6.135899528926718,181.8725499112884,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-20T03:49:46,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,381273.3088904135,80.82208866270898,0.0002119794036931,45.02813139346154,366.5763535291482,70.0,4.608630381110712,200.3931514191152,7.157526935066201,212.1593087352907,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-20T18:56:48,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,435694.7253034543,92.36224746237993,0.0002119884453456,45.02399466681818,365.5611996992909,70.0,5.266440596559091,229.00681658280672,8.179157736976183,242.45241491634053,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-21T10:03:57,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,490123.6352933403,103.90027516583852,0.0002119878897569,45.0380538,362.73561379820666,70.0,5.924347015764262,257.6146445718286,9.200935347279996,272.73992693486946,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-22T01:10:57,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,544543.8373416401,115.4391691490337,0.000211992426014,45.025143291,364.764783523692,70.0,6.582147076467519,286.2250175984421,10.222548277012152,303.0297129519194,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-22T16:22:42,Polyglot,a98dcd1d-81e8-4c52-b9ba-98152119eb2c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,599248.5571675301,127.00066561859128,0.0002119332021738,45.022997385,365.56758633067534,70.0,7.243476923541371,314.88571718170573,11.24963676124982,333.37883086649504,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-23T12:45:51,Polyglot,2484ccec-2100-4c4c-b6e6-435ed2c585ba,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54613.77945225686,11.550126398496984,0.0002114874032586,45.03137619681819,364.7858089641286,70.0,0.6601663593915156,28.63382480287487,1.0252801292337537,30.31927129150016,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-24T03:53:26,Polyglot,2484ccec-2100-4c4c-b6e6-435ed2c585ba,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,109069.18703039364,23.095515196498937,0.0002117510529354,45.02650497576924,365.02014175589295,70.0,1.3184111013187851,57.260129715011885,2.0475654876437246,60.62610630397411,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-24T19:00:52,Polyglot,2484ccec-2100-4c4c-b6e6-435ed2c585ba,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,163514.35428753495,34.6420400613965,0.0002118593209283,45.0275499225,363.8928029652876,70.0,1.97649903696516,85.88980653234592,3.069617941376579,90.93592351068774,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-25T10:08:17,Polyglot,2484ccec-2100-4c4c-b6e6-435ed2c585ba,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,217959.6767143328,46.19001499252549,0.0002119200013911,45.0285341535,364.1332683805725,70.0,2.634615922478797,114.52321929127991,4.091711951290329,121.2495471650492,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-26T12:55:19,Polyglot,2056727d-5a23-4012-8dc1-d9ed973eafe3,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54645.26329020783,11.564505259260228,0.0002116286858724,45.02202440625,366.6509312989584,70.0,0.6605413594542934,28.67061471564012,1.0258599618138646,30.35701603690833,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-28T09:16:38,Polyglot,44d02a14-b9c1-4b54-96eb-262e07ddb7aa,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54667.189970373,12.630934188028013,0.0002310514624013,225.0,364.96770478698727,70.0,3.4129260209240755,28.681692707280227,1.0617907429283409,33.156409471132726,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-01-29T00:24:39,Polyglot,44d02a14-b9c1-4b54-96eb-262e07ddb7aa,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,109148.035300284,25.27580870613449,0.0002315736479964,225.0,366.0242192060592,70.0,6.8141945241443445,57.41526349217393,2.119954521113271,66.3494125374314,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-02-02T11:01:39,Polyglot,a7a5d34d-5890-4e6c-8818-fee6be214e64,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54529.717731854005,11.516054625936697,0.0002111885977948,45.00559872,370.8932572897432,70.0,0.6591468596468995,28.54703232066313,1.023653149225791,30.229832329535885,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-02-03T04:10:53,Polyglot,58b3e182-5987-4f0f-90c8-34d6a56e3290,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54836.62605299696,11.529045078499776,0.0002102435162104,45.031022775,364.51114155448056,70.0,0.6628703193037179,28.57158590252822,1.0294762588080826,30.26393248063992,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
2026-02-03T04:11:15,Polyglot,58b3e182-5987-4f0f-90c8-34d6a56e3290,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,54858.70311053004,11.533777165940318,0.00021024516643606961,45.000000405,475.9117209547019,70.0,0.6633147701490933,28.58287248572414,1.0301670319345049,30.276354287807635,Germany,DEU,north rhine-westphalia,,,Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34,3.12.3,3.0.6,256,AMD EPYC 7713 64-Core Processor,4,4 x NVIDIA A100-SXM4-80GB,7.0932,50.7263,1950,machine,N,1.0
1 timestamp project_name run_id experiment_id duration emissions emissions_rate cpu_power gpu_power ram_power cpu_energy gpu_energy ram_energy energy_consumed country_name country_iso_code region cloud_provider cloud_region os python_version codecarbon_version cpu_count cpu_model gpu_count gpu_model longitude latitude ram_total_size tracking_mode on_cloud pue
2 2026-01-15T18:36:21 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 2468.154875099659 0.5195039106304847 0.0002104827034444 45.027290925 365.33355990502815 70.0 0.0298295135649475 1.287542577533486 0.0463341974708924 1.363706288569326 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
3 2026-01-16T09:06:58 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54704.4767507948 11.595711822278624 0.0002119700710254 45.02626823769232 364.31659250401066 70.0 0.6612322660173141 28.75075110169371 1.0269504091064288 30.438933776817496 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
4 2026-01-17T00:14:10 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 109136.3574270364 23.13841512055636 0.0002120138115845 45.033781860000005 366.0656712431605 70.0 1.319176990428561 57.37075434434462 2.048787973683132 60.7387193084561 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
5 2026-01-17T15:21:21 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 163567.73299597765 34.67404203491055 0.000211985832412 45.02352758400001 365.6402060708027 70.0 1.9771126242558168 85.97220533993158 3.0706112520106603 91.01992921619778 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
6 2026-01-18T06:28:30 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 217997.1265374664 46.20552631578055 0.0002119547493569 45.02980962 368.02914726819273 70.0 2.635007360849327 114.56288697912528 4.092370303104284 121.29026464307798 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
7 2026-01-18T21:35:36 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 272423.1234841924 57.7453517586373 0.0002119693476093 45.023594388750006 366.0163927822049 70.0 3.292879428269992 143.1755286803314 5.114087648420479 151.58249575702138 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
8 2026-01-19T12:42:46 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 326852.9779706467 69.2843478887053 0.0002119740450855 45.019291527 365.1986956207545 70.0 3.9508097393555017 171.78584064300725 6.135899528926718 181.8725499112884 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
9 2026-01-20T03:49:46 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 381273.3088904135 80.82208866270898 0.0002119794036931 45.02813139346154 366.5763535291482 70.0 4.608630381110712 200.3931514191152 7.157526935066201 212.1593087352907 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
10 2026-01-20T18:56:48 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 435694.7253034543 92.36224746237993 0.0002119884453456 45.02399466681818 365.5611996992909 70.0 5.266440596559091 229.00681658280672 8.179157736976183 242.45241491634053 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
11 2026-01-21T10:03:57 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 490123.6352933403 103.90027516583852 0.0002119878897569 45.0380538 362.73561379820666 70.0 5.924347015764262 257.6146445718286 9.200935347279996 272.73992693486946 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
12 2026-01-22T01:10:57 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 544543.8373416401 115.4391691490337 0.000211992426014 45.025143291 364.764783523692 70.0 6.582147076467519 286.2250175984421 10.222548277012152 303.0297129519194 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
13 2026-01-22T16:22:42 Polyglot a98dcd1d-81e8-4c52-b9ba-98152119eb2c 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 599248.5571675301 127.00066561859128 0.0002119332021738 45.022997385 365.56758633067534 70.0 7.243476923541371 314.88571718170573 11.24963676124982 333.37883086649504 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
14 2026-01-23T12:45:51 Polyglot 2484ccec-2100-4c4c-b6e6-435ed2c585ba 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54613.77945225686 11.550126398496984 0.0002114874032586 45.03137619681819 364.7858089641286 70.0 0.6601663593915156 28.63382480287487 1.0252801292337537 30.31927129150016 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
15 2026-01-24T03:53:26 Polyglot 2484ccec-2100-4c4c-b6e6-435ed2c585ba 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 109069.18703039364 23.095515196498937 0.0002117510529354 45.02650497576924 365.02014175589295 70.0 1.3184111013187851 57.260129715011885 2.0475654876437246 60.62610630397411 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
16 2026-01-24T19:00:52 Polyglot 2484ccec-2100-4c4c-b6e6-435ed2c585ba 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 163514.35428753495 34.6420400613965 0.0002118593209283 45.0275499225 363.8928029652876 70.0 1.97649903696516 85.88980653234592 3.069617941376579 90.93592351068774 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
17 2026-01-25T10:08:17 Polyglot 2484ccec-2100-4c4c-b6e6-435ed2c585ba 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 217959.6767143328 46.19001499252549 0.0002119200013911 45.0285341535 364.1332683805725 70.0 2.634615922478797 114.52321929127991 4.091711951290329 121.2495471650492 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
18 2026-01-26T12:55:19 Polyglot 2056727d-5a23-4012-8dc1-d9ed973eafe3 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54645.26329020783 11.564505259260228 0.0002116286858724 45.02202440625 366.6509312989584 70.0 0.6605413594542934 28.67061471564012 1.0258599618138646 30.35701603690833 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
19 2026-01-28T09:16:38 Polyglot 44d02a14-b9c1-4b54-96eb-262e07ddb7aa 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54667.189970373 12.630934188028013 0.0002310514624013 225.0 364.96770478698727 70.0 3.4129260209240755 28.681692707280227 1.0617907429283409 33.156409471132726 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
20 2026-01-29T00:24:39 Polyglot 44d02a14-b9c1-4b54-96eb-262e07ddb7aa 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 109148.035300284 25.27580870613449 0.0002315736479964 225.0 366.0242192060592 70.0 6.8141945241443445 57.41526349217393 2.119954521113271 66.3494125374314 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
21 2026-02-02T11:01:39 Polyglot a7a5d34d-5890-4e6c-8818-fee6be214e64 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54529.717731854005 11.516054625936697 0.0002111885977948 45.00559872 370.8932572897432 70.0 0.6591468596468995 28.54703232066313 1.023653149225791 30.229832329535885 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
22 2026-02-03T04:10:53 Polyglot 58b3e182-5987-4f0f-90c8-34d6a56e3290 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54836.62605299696 11.529045078499776 0.0002102435162104 45.031022775 364.51114155448056 70.0 0.6628703193037179 28.57158590252822 1.0294762588080826 30.26393248063992 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0
23 2026-02-03T04:11:15 Polyglot 58b3e182-5987-4f0f-90c8-34d6a56e3290 5b0fa12a-3dd7-45bb-9766-cc326314d9f1 54858.70311053004 11.533777165940318 0.00021024516643606961 45.000000405 475.9117209547019 70.0 0.6633147701490933 28.58287248572414 1.0301670319345049 30.276354287807635 Germany DEU north rhine-westphalia Linux-5.14.0-570.35.1.el9_6.x86_64-x86_64-with-glibc2.34 3.12.3 3.0.6 256 AMD EPYC 7713 64-Core Processor 4 4 x NVIDIA A100-SXM4-80GB 7.0932 50.7263 1950 machine N 1.0

190
evals.yaml Normal file
View File

@@ -0,0 +1,190 @@
evaluations:
arc_challenge_poly_pt_acc: 0.5230769230769231
arc_challenge_poly_pt_acc_norm: 0.5777777777777777
arc_challenge_poly_pt_acc_norm_stderr: 0.014445870094078068
arc_challenge_poly_pt_acc_stderr: 0.014608300475750825
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.9252450980392157
assin2_rte_acc_stderr,all: 0.0037560275279046665
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.9251590635527494
assin2_rte_f1_macro_stderr,all: 0.0037610728425282497
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 0.5572916666666665
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.7701197353926412
assin2_sts_pearson_stderr,all: 0.006649667590414615
assin_entailment_acc: 0.704
assin_entailment_acc_stderr: 0.00721865827261647
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.694
assin_paraphrase_acc_stderr: 0.007287268079947193
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.8366666666666667
belebele_por_Latn_acc_norm: 0.8366666666666667
belebele_por_Latn_acc_norm_stderr: 0.012329168844652528
belebele_por_Latn_acc_stderr: 0.012329168844652528
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.6620305980528511
bluex_acc,exam_id__UNICAMP_2018: 0.6481481481481481
bluex_acc,exam_id__UNICAMP_2019: 0.64
bluex_acc,exam_id__UNICAMP_2020: 0.6909090909090909
bluex_acc,exam_id__UNICAMP_2021_1: 0.6521739130434783
bluex_acc,exam_id__UNICAMP_2021_2: 0.5882352941176471
bluex_acc,exam_id__UNICAMP_2022: 0.6666666666666666
bluex_acc,exam_id__UNICAMP_2023: 0.7209302325581395
bluex_acc,exam_id__UNICAMP_2024: 0.6444444444444445
bluex_acc,exam_id__USP_2018: 0.5925925925925926
bluex_acc,exam_id__USP_2019: 0.7
bluex_acc,exam_id__USP_2020: 0.6607142857142857
bluex_acc,exam_id__USP_2021: 0.75
bluex_acc,exam_id__USP_2022: 0.5918367346938775
bluex_acc,exam_id__USP_2023: 0.7045454545454546
bluex_acc,exam_id__USP_2024: 0.7073170731707317
bluex_acc_stderr,all: 0.010157757528559894
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.037594875406546435
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.03953375278949041
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03605420458368598
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.0405315180666698
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.039752636457935614
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04360776726045774
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.039432470230869696
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.041213076472343936
bluex_acc_stderr,exam_id__USP_2018: 0.038650432775192395
bluex_acc_stderr,exam_id__USP_2019: 0.04175277819931915
bluex_acc_stderr,exam_id__USP_2020: 0.03644397183647981
bluex_acc_stderr,exam_id__USP_2021: 0.034773619645811646
bluex_acc_stderr,exam_id__USP_2022: 0.040532053004604704
bluex_acc_stderr,exam_id__USP_2023: 0.039742872820681924
bluex_acc_stderr,exam_id__USP_2024: 0.040951553558739306
bluex_alias: bluex
calame_pt_acc: 0.6107899807321773
calame_pt_acc_stderr: 0.0107035762556229
calame_pt_alias: calame_pt
calame_pt_perplexity: 5.713055201421455
calame_pt_perplexity_stderr: 0.29495381614560345
enem_challenge_acc,all: 0.7753673897830651
enem_challenge_acc,exam_id__2009: 0.7478260869565218
enem_challenge_acc,exam_id__2010: 0.811965811965812
enem_challenge_acc,exam_id__2011: 0.8461538461538461
enem_challenge_acc,exam_id__2012: 0.8448275862068966
enem_challenge_acc,exam_id__2013: 0.7777777777777778
enem_challenge_acc,exam_id__2014: 0.8073394495412844
enem_challenge_acc,exam_id__2015: 0.8067226890756303
enem_challenge_acc,exam_id__2016: 0.743801652892562
enem_challenge_acc,exam_id__2016_2: 0.7154471544715447
enem_challenge_acc,exam_id__2017: 0.75
enem_challenge_acc,exam_id__2022: 0.6842105263157895
enem_challenge_acc,exam_id__2023: 0.7851851851851852
enem_challenge_acc_stderr,all: 0.006377145135723042
enem_challenge_acc_stderr,exam_id__2009: 0.023447252641875988
enem_challenge_acc_stderr,exam_id__2010: 0.02087704326839612
enem_challenge_acc_stderr,exam_id__2011: 0.01921565112452091
enem_challenge_acc_stderr,exam_id__2012: 0.01944793905815595
enem_challenge_acc_stderr,exam_id__2013: 0.023084030560191867
enem_challenge_acc_stderr,exam_id__2014: 0.021892563584984096
enem_challenge_acc_stderr,exam_id__2015: 0.020893018955083217
enem_challenge_acc_stderr,exam_id__2016: 0.022776450345788787
enem_challenge_acc_stderr,exam_id__2016_2: 0.023503035027562222
enem_challenge_acc_stderr,exam_id__2017: 0.023138027075607918
enem_challenge_acc_stderr,exam_id__2022: 0.02320588990454305
enem_challenge_acc_stderr,exam_id__2023: 0.020404682391600704
enem_challenge_alias: enem
faquad_nli_acc,all: 0.7876923076923077
faquad_nli_acc_stderr,all: 0.01133278097111669
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.45449901481427424
faquad_nli_f1_macro_stderr,all: 0.008069363645658589
global_piqa_completions_por_latn_braz_acc: 0.84
global_piqa_completions_por_latn_braz_acc_bytes: 0.83
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.03775251680686369
global_piqa_completions_por_latn_braz_acc_norm: 0.83
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.03775251680686369
global_piqa_completions_por_latn_braz_acc_stderr: 0.03684529491774706
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
hatebr_offensive_acc,all: 0.665
hatebr_offensive_acc_stderr,all: 0.00890653495166499
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.6234605955470173
hatebr_offensive_f1_macro_stderr,all: 0.00951858317428499
hellaswag_poly_pt_acc: 0.4838010618701918
hellaswag_poly_pt_acc_norm: 0.6531585220500596
hellaswag_poly_pt_acc_norm_stderr: 0.004954741713215018
hellaswag_poly_pt_acc_stderr: 0.00520221346811777
hellaswag_poly_pt_alias: hellaswag_poly_pt
lambada_poly_pt_acc: 0.6252668348534834
lambada_poly_pt_acc_stderr: 0.006743817908692071
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 6.574656712295472
lambada_poly_pt_perplexity_stderr: 0.18832300331707774
mmlu_poly_pt_acc: 0.6540078054638246
mmlu_poly_pt_acc_stderr: 0.004121199159002156
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.584510250569476
oab_exams_acc,exam_id__2010-01: 0.3764705882352941
oab_exams_acc,exam_id__2010-02: 0.59
oab_exams_acc,exam_id__2011-03: 0.5656565656565656
oab_exams_acc,exam_id__2011-04: 0.5125
oab_exams_acc,exam_id__2011-05: 0.6625
oab_exams_acc,exam_id__2012-06: 0.625
oab_exams_acc,exam_id__2012-06a: 0.7375
oab_exams_acc,exam_id__2012-07: 0.6125
oab_exams_acc,exam_id__2012-08: 0.55
oab_exams_acc,exam_id__2012-09: 0.4805194805194805
oab_exams_acc,exam_id__2013-10: 0.65
oab_exams_acc,exam_id__2013-11: 0.625
oab_exams_acc,exam_id__2013-12: 0.65
oab_exams_acc,exam_id__2014-13: 0.525
oab_exams_acc,exam_id__2014-14: 0.625
oab_exams_acc,exam_id__2014-15: 0.6538461538461539
oab_exams_acc,exam_id__2015-16: 0.5875
oab_exams_acc,exam_id__2015-17: 0.5897435897435898
oab_exams_acc,exam_id__2015-18: 0.575
oab_exams_acc,exam_id__2016-19: 0.5897435897435898
oab_exams_acc,exam_id__2016-20: 0.6125
oab_exams_acc,exam_id__2016-20a: 0.55
oab_exams_acc,exam_id__2016-21: 0.4875
oab_exams_acc,exam_id__2017-22: 0.6375
oab_exams_acc,exam_id__2017-23: 0.525
oab_exams_acc,exam_id__2017-24: 0.6375
oab_exams_acc,exam_id__2018-25: 0.5625
oab_exams_acc_stderr,all: 0.006071412237214423
oab_exams_acc_stderr,exam_id__2010-01: 0.030361740131894334
oab_exams_acc_stderr,exam_id__2010-02: 0.028445183897774385
oab_exams_acc_stderr,exam_id__2011-03: 0.02862790167127372
oab_exams_acc_stderr,exam_id__2011-04: 0.03224493616787287
oab_exams_acc_stderr,exam_id__2011-05: 0.030467009008680036
oab_exams_acc_stderr,exam_id__2012-06: 0.031220571629946996
oab_exams_acc_stderr,exam_id__2012-06a: 0.02843559794708646
oab_exams_acc_stderr,exam_id__2012-07: 0.03139589988285276
oab_exams_acc_stderr,exam_id__2012-08: 0.03226132851591818
oab_exams_acc_stderr,exam_id__2012-09: 0.032830805301195386
oab_exams_acc_stderr,exam_id__2013-10: 0.030611536360793473
oab_exams_acc_stderr,exam_id__2013-11: 0.031310092407276585
oab_exams_acc_stderr,exam_id__2013-12: 0.030692719997990617
oab_exams_acc_stderr,exam_id__2014-13: 0.032358129209763435
oab_exams_acc_stderr,exam_id__2014-14: 0.031382714558388446
oab_exams_acc_stderr,exam_id__2014-15: 0.03109637957099322
oab_exams_acc_stderr,exam_id__2015-16: 0.031957806650269406
oab_exams_acc_stderr,exam_id__2015-17: 0.03208206142728883
oab_exams_acc_stderr,exam_id__2015-18: 0.03182114971496286
oab_exams_acc_stderr,exam_id__2016-19: 0.03228511767428725
oab_exams_acc_stderr,exam_id__2016-20: 0.031372223696958024
oab_exams_acc_stderr,exam_id__2016-20a: 0.0321935167686262
oab_exams_acc_stderr,exam_id__2016-21: 0.03209267502993051
oab_exams_acc_stderr,exam_id__2017-22: 0.03105400471909683
oab_exams_acc_stderr,exam_id__2017-23: 0.03235792164586319
oab_exams_acc_stderr,exam_id__2017-24: 0.031098329350728315
oab_exams_acc_stderr,exam_id__2018-25: 0.03209246971016282
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.6145710928319624
portuguese_hate_speech_acc_stderr,all: 0.011835075822813054
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.6103088177807561
portuguese_hate_speech_f1_macro_stderr,all: 0.011900091760317547
tweetsentbr_acc,all: 0.7298507462686568
tweetsentbr_acc_stderr,all: 0.006986496038388035
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.7027752533485003
tweetsentbr_f1_macro_stderr,all: 0.007392699151541939
step: 50000

22
evals_all_steps.csv Normal file

File diff suppressed because one or more lines are too long

27
evals_for_comparison.csv Normal file

File diff suppressed because one or more lines are too long

14
generation_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"bos_token_id": 1,
"eos_token_id": 2,
"pad_token_id": 49109,
"transformers_version": "4.53.2",
"do_sample": true,
"max_new_tokens": 1024,
"renormalize_logits": true,
"repetition_penalty": 1.2,
"temperature": 0.1,
"top_k": 50,
"top_p": 1.0,
"use_cache": false
}

3
logo.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1856d91c3b35390cee5122902d94044657c67df7034ca4005316275c404fc8a0
size 197189

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:869e1d78e03c3f52e59df11c36b7409a393fb33c1a9a990d2c443b1505390122
size 4996742472

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7de28cf45b0c898934748c85b310680f859797b417666e05dbd5b2246225d0ee
size 2773643768

View File

@@ -0,0 +1,407 @@
{
"metadata": {
"total_parameters": 3759341056,
"total_size": 7770340352
},
"weight_map": {
"lm_head.weight": "model-00002-of-00002.safetensors",
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.norm.weight": "model-00002-of-00002.safetensors"
}
}

82
ruler.yaml Normal file
View File

@@ -0,0 +1,82 @@
model_name: Tucano2-qwen-3.7B-Base
results:
niah_pt_multikey_1_1024: 0.596
niah_pt_multikey_1_1024_stderr: 0.021966635293832883
niah_pt_multikey_1_2048: 0.614
niah_pt_multikey_1_2048_stderr: 0.021793529219281196
niah_pt_multikey_1_4096: 0.436
niah_pt_multikey_1_4096_stderr: N/A
niah_pt_multikey_1_alias: " - niah_pt_multikey_1"
niah_pt_multikey_2_1024: 0.67
niah_pt_multikey_2_1024_stderr: 0.021049612166134782
niah_pt_multikey_2_2048: 0.682
niah_pt_multikey_2_2048_stderr: 0.020847571620814014
niah_pt_multikey_2_4096: 0.608
niah_pt_multikey_2_4096_stderr: N/A
niah_pt_multikey_2_alias: " - niah_pt_multikey_2"
niah_pt_multikey_3_1024: 0.842
niah_pt_multikey_3_1024_stderr: 0.0163280498045799
niah_pt_multikey_3_2048: 0.856
niah_pt_multikey_3_2048_stderr: 0.015716934945725784
niah_pt_multikey_3_4096: 0.786
niah_pt_multikey_3_4096_stderr: N/A
niah_pt_multikey_3_alias: " - niah_pt_multikey_3"
niah_pt_multiquery_1024: 0.5815
niah_pt_multiquery_1024_stderr: 0.015245666949052864
niah_pt_multiquery_2048: 0.5405
niah_pt_multiquery_2048_stderr: 0.014383437229433934
niah_pt_multiquery_4096: 0.493
niah_pt_multiquery_4096_stderr: N/A
niah_pt_multiquery_alias: " - niah_pt_multiquery"
niah_pt_multivalue_1024: 0.5685
niah_pt_multivalue_1024_stderr: 0.015209475872383808
niah_pt_multivalue_2048: 0.3425
niah_pt_multivalue_2048_stderr: 0.014868747738751817
niah_pt_multivalue_4096: 0.355
niah_pt_multivalue_4096_stderr: N/A
niah_pt_multivalue_alias: " - niah_pt_multivalue"
niah_pt_single_1_1024: 0.63
niah_pt_single_1_1024_stderr: 0.021613289165165816
niah_pt_single_1_2048: 0.578
niah_pt_single_1_2048_stderr: 0.022109039310618563
niah_pt_single_1_4096: 0.462
niah_pt_single_1_4096_stderr: N/A
niah_pt_single_1_alias: " - niah_pt_single_1"
niah_pt_single_2_1024: 0.538
niah_pt_single_2_1024_stderr: 0.022318338119870523
niah_pt_single_2_2048: 0.548
niah_pt_single_2_2048_stderr: 0.02227969410784354
niah_pt_single_2_4096: 0.566
niah_pt_single_2_4096_stderr: N/A
niah_pt_single_2_alias: " - niah_pt_single_2"
niah_pt_single_3_1024: 0.716
niah_pt_single_3_1024_stderr: 0.020186703693570777
niah_pt_single_3_2048: 0.726
niah_pt_single_3_2048_stderr: 0.019966103540279518
niah_pt_single_3_4096: 0.702
niah_pt_single_3_4096_stderr: N/A
niah_pt_single_3_alias: " - niah_pt_single_3"
ruler_pt_4096: 0.5871696969696969
ruler_pt_4096_stderr: N/A
ruler_pt_alias: ruler_pt
ruler_pt_cwe_1024: 0.3028
ruler_pt_cwe_1024_stderr: 0.0064114376754463
ruler_pt_cwe_2048: 0.253
ruler_pt_cwe_2048_stderr: 0.006635515060659607
ruler_pt_cwe_4096: 0.3358
ruler_pt_cwe_4096_stderr: N/A
ruler_pt_cwe_alias: " - ruler_pt_cwe"
ruler_pt_fwe_1024: 0.864
ruler_pt_fwe_1024_stderr: 0.010004852975181538
ruler_pt_fwe_2048: 0.8266666666666667
ruler_pt_fwe_2048_stderr: 0.011102646519713471
ruler_pt_fwe_4096: 0.7346666666666666
ruler_pt_fwe_4096_stderr: N/A
ruler_pt_fwe_alias: " - ruler_pt_fwe"
ruler_pt_vt_1024: 0.9816
ruler_pt_vt_1024_stderr: 0.004767424532527671
ruler_pt_vt_2048: 0.9708
ruler_pt_vt_2048_stderr: 0.006233171331649675
ruler_pt_vt_4096: 0.9803999999999999
ruler_pt_vt_4096_stderr: N/A
ruler_pt_vt_alias: " - ruler_pt_vt"

30
special_tokens_map.json Normal file
View File

@@ -0,0 +1,30 @@
{
"bos_token": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<|unk|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

463711
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

397
tokenizer_config.json Normal file
View File

@@ -0,0 +1,397 @@
{
"add_bos_token": false,
"add_eos_token": false,
"add_prefix_space": null,
"added_tokens_decoder": {
"0": {
"content": "<|unk|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49109": {
"content": "<|pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49110": {
"content": "<tools>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49111": {
"content": "</tools>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49112": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49113": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49114": {
"content": "<tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49115": {
"content": "</tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49116": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49117": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49118": {
"content": "<answer>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49119": {
"content": "</answer>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49120": {
"content": "<context>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49121": {
"content": "</context>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49122": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49123": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49124": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49125": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49126": {
"content": "<|image|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49127": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49128": {
"content": "<|image_placeholder|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49129": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49130": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49131": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49132": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49133": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49134": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49135": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49136": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49137": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49138": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49139": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49140": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49141": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49142": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49143": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49144": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49145": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49146": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49147": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49148": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"49149": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49150": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"49151": {
"content": " ",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"bos_token": "<|im_start|>",
"bos_token_id": 1,
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"eos_token_id": 2,
"extra_special_tokens": {},
"legacy": false,
"model_input_names": [
"input_ids",
"attention_mask"
],
"model_max_length": 4096,
"pad_token": "<|pad|>",
"pad_token_id": 49109,
"padding_side": "right",
"sp_model_kwargs": {},
"spaces_between_special_tokens": false,
"tokenizer_class": "PreTrainedTokenizerFast",
"truncation_side": "right",
"unk_token": "<|unk|>",
"unk_token_id": 0,
"use_default_system_prompt": false
}

3
train_logs.parquet Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b8b5e183f71ab4fafad5f542e54f1cf231935a2de4d14f1d0b67e23fd4ebc499
size 1292663

99
training_config.yaml Normal file
View File

@@ -0,0 +1,99 @@
# Directory settings
checkpoint_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/models/Tucano2-qwen-3.7B-Base"
train_dataset_dir:
# Total: ~48B (a path listed twice below is counted once per listing)
# Web Text (~28B, 58%)
- /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/4 # 28B (PT)
- /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/5 # 0.1B (PT)
- /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/5 # 0.1B (PT)
# Synthetic Text (~20B, 42%; the same path is listed twice, 10B each)
- /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2_synth # 10B (PT)
- /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2_synth # 10B (PT)
val_dataset_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/tokenized/validation"
dataset_type: "parquet"
cache_dir: "/lustre/scratch/data/polyglot_datasets/.cache"
# Data loading settings
pin_memory: true
num_workers_for_dataloader: 16
prefetch_factor: 4
shuffle_dataset: true
mask_eos_token: false
mask_pad_token: false
# Model architecture settings
vocab_size: 49152
num_hidden_layers: 36
num_attention_heads: 32
num_key_value_heads: 8
head_dim: 128
hidden_size: 2560
intermediate_size: 9728
max_position_embeddings: 4096
tie_word_embeddings: true
hidden_act: "silu"
output_hidden_states: false
attn_implementation: "flash_attention_2"
use_cache: false
no_rope_layer_interval: null
rope_theta: 1000000.0
rope_scale_factor: null
rms_norm_eps: 0.000001
# Training settings
total_batch_size: 1048576
micro_batch_size: 16
eval_micro_batch_size: 16
num_train_epochs: 1
warmup_steps: 100
max_learning_rate: 0.000075
min_learning_rate: 0.0
muon_learning_rate: 0.001
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
eps: 0.00000001
lr_decay_type: "cosine"
use_sqrt: true
lr_decay_iters_coef: 1.
seed: 42
max_steps: 50000
max_grad_norm: 1.0
# Precision, optimization, and sharding settings
torch_compile: false
mat_mul_precision: "highest"
tf32: true
bf16: true
gradient_checkpointing: true
use_liger_kernel: true
fsdp_mixed_precision: true
dp_shard: null
full_shard: false
cpu_offload: false
explicit_prefetching: true
# Hub settings
push_to_hub: false
hub_token: null
hub_model_id: null
# Tokenizer and reference model settings
tokenizer_name_or_path: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/models/Tucano2-qwen-3.7B-Base/step-0"
chat_template_path: null
reference_model: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/models/Tucano2-qwen-3.7B-Base/step-0"
continual_pretraining: true
# Checkpoint settings
resume_from_checkpoint: null
checkpointing_steps: 2500
begin_new_stage: false
stage_name: "single_cosine"
# Miscellaneous settings
sanity_check: false
sanity_check_num_samples: 100000
wandb_token: null
wandb_id: "tucano2-qwen-3.7b"
wandb_project: "Polyglot"
wandb_desc: "Developing LLMs for low-resource languages"

3
val_logs.parquet Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3be7f2668b7854e5f47eac5ca1497515931bb63aed477d8003e99d58c110d22
size 1948