From 6cbb6550774d13447060d7c7a3283afe52796d63 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 1 May 2026 10:37:03 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: EleutherAI/pythia-160m-deduped Source: Original Platform --- .gitattributes | 49 +++++++ README.md | 281 ++++++++++++++++++++++++++++++++++++++++ config.json | 24 ++++ configuration.json | 1 + model.safetensors | 3 + pytorch_model.bin | 3 + special_tokens_map.json | 5 + tokenizer.json | 3 + tokenizer_config.json | 9 ++ 9 files changed, 378 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 config.json create mode 100644 configuration.json create mode 100644 model.safetensors create mode 100644 pytorch_model.bin create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..21b3632 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,49 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d79bdb9 --- /dev/null +++ b/README.md @@ -0,0 
+1,281 @@ +--- +language: +- en +tags: +- pytorch +- causal-lm +- pythia +license: apache-2.0 +datasets: +- EleutherAI/the_pile_deduplicated +--- + +The *Pythia Scaling Suite* is a collection of models developed to facilitate +interpretability research [(see paper)](https://arxiv.org/pdf/2304.01373.pdf). +It contains two sets of eight models of sizes +70M, 160M, 410M, 1B, 1.4B, 2.8B, 6.9B, and 12B. For each size, there are two +models: one trained on the Pile, and one trained on the Pile after the dataset +has been globally deduplicated. All 8 model sizes are trained on the exact +same data, in the exact same order. We also provide 154 intermediate +checkpoints per model, hosted on Hugging Face as branches. + +The Pythia model suite was designed to promote scientific +research on large language models, especially interpretability research. +Despite not centering downstream performance as a design goal, we find the +models match or exceed the performance of +similar and same-sized models, such as those in the OPT and GPT-Neo suites. + +
+**Details on previous early release and naming convention**
+
+Previously, we released an early version of the Pythia suite to the public.
+However, we decided to retrain the model suite to address a few hyperparameter
+discrepancies. This model card lists the changes;
+see appendix B in the Pythia paper for further discussion. We found no
+difference in benchmark performance between the two Pythia versions.
+The old models are
+[still available](https://huggingface.co/models?other=pythia_v0), but we
+suggest the retrained suite if you are just starting to use Pythia.
+
+**This is the current release.** + +Please note that all models in the *Pythia* suite were renamed in January +2023. For clarity, a table +comparing the old and new names is provided in this model card, together +with exact parameter counts. +
+
+
+# Pythia-160M-deduped
+
+## Model Details
+
+- Developed by: [EleutherAI](http://eleuther.ai)
+- Model type: Transformer-based Language Model
+- Language: English
+- Learn more: [Pythia's GitHub repository](https://github.com/EleutherAI/pythia)
+  for training procedure, config files, and details on how to use the models.
+  [See paper](https://arxiv.org/pdf/2304.01373.pdf) for more evals and implementation
+  details.
+- Library: [GPT-NeoX](https://github.com/EleutherAI/gpt-neox)
+- License: Apache 2.0
+- Contact: to ask questions about this model, join the [EleutherAI
+  Discord](https://discord.gg/zBGx3azzUn), and post them in `#release-discussion`.
+  Please read the existing *Pythia* documentation before asking about it in the
+  EleutherAI Discord. For general correspondence:
+  [contact@eleuther.ai](mailto:contact@eleuther.ai).
+
+
+| Pythia model | Non-Embedding Params | Layers | Model Dim | Heads | Batch Size | Learning Rate | Equivalent Models |
+| -----------: | -------------------: | :----: | :-------: | :---: | :--------: | :-------------------: | :--------------------: |
+| 70M  | 18,915,328     | 6  | 512  | 8  | 2M | 1.0 x 10<sup>-3</sup> | — |
+| 160M | 85,056,000     | 12 | 768  | 12 | 2M | 6.0 x 10<sup>-4</sup> | GPT-Neo 125M, OPT-125M |
+| 410M | 302,311,424    | 24 | 1024 | 16 | 2M | 3.0 x 10<sup>-4</sup> | OPT-350M |
+| 1.0B | 805,736,448    | 16 | 2048 | 8  | 2M | 3.0 x 10<sup>-4</sup> | — |
+| 1.4B | 1,208,602,624  | 24 | 2048 | 16 | 2M | 2.0 x 10<sup>-4</sup> | GPT-Neo 1.3B, OPT-1.3B |
+| 2.8B | 2,517,652,480  | 32 | 2560 | 32 | 2M | 1.6 x 10<sup>-4</sup> | GPT-Neo 2.7B, OPT-2.7B |
+| 6.9B | 6,444,163,072  | 32 | 4096 | 32 | 2M | 1.2 x 10<sup>-4</sup> | OPT-6.7B |
+| 12B  | 11,327,027,200 | 36 | 5120 | 40 | 2M | 1.2 x 10<sup>-4</sup> | — |
+
+*Engineering details for the Pythia Suite. Deduped and non-deduped models of a
+given size have the same hyperparameters. “Equivalent” models have exactly the
+same architecture, and the same number of non-embedding parameters.*
+
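+As a quick sanity check, the architecture columns for the 160M row can be read
+back from this model's published `config.json` via Transformers. A minimal
+sketch (requires network access; the model id is the one this card describes):
+
+```python
+from transformers import AutoConfig
+
+# Fetch the published config for the model documented in this card.
+config = AutoConfig.from_pretrained("EleutherAI/pythia-160m-deduped")
+
+# These values should match the 160M row of the table above.
+print(config.num_hidden_layers)    # 12 layers
+print(config.hidden_size)          # 768 model dim
+print(config.num_attention_heads)  # 12 heads
+```
+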
+
+## Uses and Limitations
+
+### Intended Use
+
+The primary intended use of Pythia is research on the behavior, functionality,
+and limitations of large language models. This suite is intended to provide
+a controlled setting for performing scientific experiments. We also provide
+154 checkpoints per model: initial `step0`, 10 log-spaced checkpoints
+`step{1,2,4...512}`, and 143 evenly-spaced checkpoints from `step1000` to
+`step143000`. These checkpoints are hosted on Hugging Face as branches. Note
+that branch `143000` corresponds exactly to the model checkpoint on the `main`
+branch of each model.
+
+You may also further fine-tune and adapt Pythia-160M-deduped for deployment,
+as long as your use is in accordance with the Apache 2.0 license. Pythia
+models work with the Hugging Face [Transformers
+Library](https://huggingface.co/docs/transformers/index). If you decide to use
+pre-trained Pythia-160M-deduped as a basis for your fine-tuned model, please
+conduct your own risk and bias assessment.
+
+### Out-of-scope use
+
+The Pythia Suite is **not** intended for deployment. It is not in itself
+a product and cannot be used for human-facing interactions. For example,
+the model may generate harmful or offensive text. Please evaluate the risks
+associated with your particular use case.
+
+Pythia models are English-language only, and are not suitable for translation
+or generating text in other languages.
+
+Pythia-160M-deduped has not been fine-tuned for downstream contexts in which
+language models are commonly deployed, such as writing genre prose,
+or commercial chatbots. This means Pythia-160M-deduped will **not**
+respond to a given prompt the way a product like ChatGPT does. This is because,
+unlike this model, ChatGPT was fine-tuned using methods such as Reinforcement
+Learning from Human Feedback (RLHF) to better “follow” human instructions.
+
+### Limitations and biases
+
+The core functionality of a large language model is to take a string of text
+and predict the next token. The token deemed statistically most likely by the
+model need not produce the most “accurate” text. Never rely on
+Pythia-160M-deduped to produce factually accurate output.
+
+This model was trained on [the Pile](https://pile.eleuther.ai/), a dataset
+known to contain profanity and texts that are lewd or otherwise offensive.
+See [Section 6 of the Pile paper](https://arxiv.org/abs/2101.00027) for a
+discussion of documented biases with regards to gender, religion, and race.
+Pythia-160M-deduped may produce socially unacceptable or undesirable text,
+*even if* the prompt itself does not include anything explicitly offensive.
+
+If you plan on using text generated through, for example, the Hosted Inference
+API, we recommend having a human curate the outputs of this language model
+before presenting them to other people. Please inform your audience that the
+text was generated by Pythia-160M-deduped.
+ +### Quickstart + +Pythia models can be loaded and used via the following code, demonstrated here +for the third `pythia-70m-deduped` checkpoint: + +```python +from transformers import GPTNeoXForCausalLM, AutoTokenizer + +model = GPTNeoXForCausalLM.from_pretrained( + "EleutherAI/pythia-70m-deduped", + revision="step3000", + cache_dir="./pythia-70m-deduped/step3000", +) + +tokenizer = AutoTokenizer.from_pretrained( + "EleutherAI/pythia-70m-deduped", + revision="step3000", + cache_dir="./pythia-70m-deduped/step3000", +) + +inputs = tokenizer("Hello, I am", return_tensors="pt") +tokens = model.generate(**inputs) +tokenizer.decode(tokens[0]) +``` + +Revision/branch `step143000` corresponds exactly to the model checkpoint on +the `main` branch of each model.
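+The same pattern applies to the model documented here. A minimal sketch that
+loads Pythia-160M-deduped from the default `main` branch (equivalent to
+`step143000`) and generates a short continuation; `max_new_tokens` is only
+illustrative:
+
+```python
+from transformers import GPTNeoXForCausalLM, AutoTokenizer
+
+# Omitting `revision` selects the `main` branch, i.e. the final checkpoint.
+model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-160m-deduped")
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m-deduped")
+
+inputs = tokenizer("Hello, I am", return_tensors="pt")
+tokens = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(tokens[0]))
+```
+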
+For more information on how to use all Pythia models, see [documentation on +GitHub](https://github.com/EleutherAI/pythia). + +## Training + +### Training data + +Pythia-160M-deduped was trained on the Pile **after the dataset has been globally +deduplicated**.
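+The deduplicated dataset is published on the Hugging Face Hub as
+`EleutherAI/the_pile_deduplicated` (see the dataset tag in this card's
+metadata). A hedged sketch of inspecting a few documents with the `datasets`
+library; streaming avoids downloading the full corpus, and the `text` field
+name is assumed from the Hub dataset's usual schema:
+
+```python
+from datasets import load_dataset
+
+# Stream the deduplicated Pile rather than downloading all of it.
+ds = load_dataset("EleutherAI/the_pile_deduplicated", split="train", streaming=True)
+
+# Peek at the first two documents.
+for i, example in enumerate(ds):
+    print(example["text"][:200])
+    if i >= 1:
+        break
+```
+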
+[The Pile](https://pile.eleuther.ai/) is an 825GiB general-purpose dataset in
+English. It was created by EleutherAI specifically for training large language
+models. It contains texts from 22 diverse sources, roughly broken down into
+five categories: academic writing (e.g. arXiv), internet (e.g. CommonCrawl),
+prose (e.g. Project Gutenberg), dialogue (e.g. YouTube subtitles), and
+miscellaneous (e.g. GitHub, Enron Emails). See [the Pile
+paper](https://arxiv.org/abs/2101.00027) for a breakdown of all data sources,
+methodology, and a discussion of ethical implications. Consult [the
+datasheet](https://arxiv.org/abs/2201.07311) for more detailed documentation
+about the Pile and its component datasets. The Pile can be downloaded from
+the [official website](https://pile.eleuther.ai/), or from a [community
+mirror](https://the-eye.eu/public/AI/pile/).
+
+### Training procedure
+
+All models were trained on the exact same data, in the exact same order. Each
+model saw 299,892,736,000 tokens during training, and 143 checkpoints for each
+model were saved every 2,097,152,000 tokens, spaced evenly throughout training,
+from `step1000` to `step143000` (which is the same as `main`). In addition, we
+provide frequent early checkpoints: `step0` and `step{1,2,4...512}`.
+This corresponds to training for just under 1 epoch on the Pile for
+non-deduplicated models, and about 1.5 epochs on the deduplicated Pile.
+
+All *Pythia* models were trained for 143,000 steps at a batch size
+of 2M (2,097,152 tokens).
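+These figures are mutually consistent; a quick arithmetic check in plain
+Python (no downloads involved):
+
+```python
+# Numbers quoted above.
+tokens_per_step = 2_097_152            # batch size of 2M tokens
+total_steps = 143_000
+checkpoint_interval_tokens = 2_097_152_000
+
+# 143,000 steps at 2,097,152 tokens per step gives the quoted token total.
+assert tokens_per_step * total_steps == 299_892_736_000
+
+# A checkpoint every 2,097,152,000 tokens is one checkpoint per 1,000 steps.
+assert checkpoint_interval_tokens // tokens_per_step == 1_000
+```
+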
+See [GitHub](https://github.com/EleutherAI/pythia) for more details on training + procedure, including [how to reproduce + it](https://github.com/EleutherAI/pythia/blob/main/README.md#reproducing-training).
+Pythia uses the same tokenizer as
+[GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b).
+
+## Evaluations
+
+All 16 *Pythia* models were evaluated using the [LM Evaluation
+Harness](https://github.com/EleutherAI/lm-evaluation-harness). You can access
+the results by model and step at `results/json/*` in the [GitHub
+repository](https://github.com/EleutherAI/pythia/tree/main/results/json/).
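+To reproduce a single benchmark locally, the harness can be driven from
+Python. The sketch below is hedged: the `simple_evaluate` entry point, the
+`hf` model type, and the `lambada_openai` task name follow recent (v0.4+)
+releases of `lm-eval` and should be checked against the harness documentation
+for the version you install:
+
+```python
+import lm_eval
+
+# Evaluate this card's model on one task; results is a nested dict keyed by task.
+results = lm_eval.simple_evaluate(
+    model="hf",
+    model_args="pretrained=EleutherAI/pythia-160m-deduped",
+    tasks=["lambada_openai"],
+    batch_size=8,
+)
+print(results["results"]["lambada_openai"])
+```
+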
+Plots of evaluation results for all Pythia and Pythia-deduped models compared
+with OPT and BLOOM are available in the Pythia paper and in the GitHub
+repository, covering the following benchmarks:
+
+  LAMBADA – OpenAI
+
+  Physical Interaction: Question Answering (PIQA)
+
+  WinoGrande
+
+  AI2 Reasoning Challenge – Easy Set
+
+ SciQ + +
+ +## Changelog + +This section compares differences between previously released +[Pythia v0](https://huggingface.co/models?other=pythia_v0) and the current +models. See Appendix B of the Pythia paper for further discussion of these +changes and the motivation behind them. We found that retraining Pythia had no +impact on benchmark performance. + +- All model sizes are now trained with uniform batch size of 2M tokens. +Previously, the models of size 160M, 410M, and 1.4B parameters were trained +with batch sizes of 4M tokens. +- We added checkpoints at initialization (step 0) and steps {1,2,4,8,16,32,64, +128,256,512} in addition to every 1000 training steps. +- Flash Attention was used in the new retrained suite. +- We remedied a minor inconsistency that existed in the original suite: all +models of size 2.8B parameters or smaller had a learning rate (LR) schedule +which decayed to a minimum LR of 10% the starting LR rate, but the 6.9B and +12B models all used an LR schedule which decayed to a minimum LR of 0. In +the redone training runs, we rectified this inconsistency: all models now were +trained with LR decaying to a minimum of 0.1× their maximum LR. + +### Naming convention and parameter count + +*Pythia* models were renamed in January 2023. It is possible that the old +naming convention still persists in some documentation by accident. The +current naming convention (70M, 160M, etc.) is based on total parameter count. + +
+ +| current Pythia suffix | old suffix | total params | non-embedding params | +| --------------------: | ---------: | -------------: | -------------------: | +| 70M | 19M | 70,426,624 | 18,915,328 | +| 160M | 125M | 162,322,944 | 85,056,000 | +| 410M | 350M | 405,334,016 | 302,311,424 | +| 1B | 800M | 1,011,781,632 | 805,736,448 | +| 1.4B | 1.3B | 1,414,647,808 | 1,208,602,624 | +| 2.8B | 2.7B | 2,775,208,960 | 2,517,652,480 | +| 6.9B | 6.7B | 6,857,302,016 | 6,444,163,072 | +| 12B | 13B | 11,846,072,320 | 11,327,027,200 | +
\ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..b8368ff --- /dev/null +++ b/config.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "GPTNeoXForCausalLM" + ], + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "gelu", + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.24.0", + "use_cache": true, + "use_parallel_residual": true, + "vocab_size": 50304 +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..bbeeda1 --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "text-generation", "allow_remote": true} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..03754f4 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbaae5c00917b163baa499fc8eb64859ee0c850c5fdecfc32f4d70dc07213575 +size 374998696 diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000..c922476 --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99f46ea77004069e1a95dfc4e3aab35846c09e6c861b372a7213f759c9e1a7da +size 375036845 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..0204ed1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,5 @@ +{ + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "unk_token": "<|endoftext|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..8fa6a67 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24618a1b3e6a38167beff1c72cffd126c3a66254347304b50547d12c5f25624 +size 2113710 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..f1860ed --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,9 @@ +{ + "add_prefix_space": false, + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "name_or_path": "EleutherAI/gpt-neox-20b", + "special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json", + "tokenizer_class": "GPTNeoXTokenizer", + "unk_token": "<|endoftext|>" +}