Update metadata with huggingface_hub

ai-modelscope
2025-02-18 21:37:10 +08:00
parent af0702b6ee
commit 5e18b0bc8c
27 changed files with 276 additions and 63 deletions

.gitattributes

@@ -1,47 +1,59 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q6_K_L.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_L.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_L.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q2_K_L.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
allenai_Llama-3.1-Tulu-3.1-8B.imatrix filter=lfs diff=lfs merge=lfs -text

README.md

@@ -1,47 +1,175 @@
---
quantized_by: bartowski
pipeline_tag: text-generation
license: llama3.1
base_model: allenai/Llama-3.1-Tulu-3.1-8B
language:
- en
datasets:
- allenai/RLVR-GSM-MATH-IF-Mixed-Constraints
---

## Llamacpp imatrix Quantizations of Llama-3.1-Tulu-3.1-8B by allenai

Using <a href="https://github.com/ggerganov/llama.cpp/">llama.cpp</a> release <a href="https://github.com/ggerganov/llama.cpp/releases/tag/b4688">b4688</a> for quantization.

Original model: https://huggingface.co/allenai/Llama-3.1-Tulu-3.1-8B

All quants made using imatrix option with dataset from [here](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8)

Run them in [LM Studio](https://lmstudio.ai/)

Run them directly with [llama.cpp](https://github.com/ggerganov/llama.cpp), or any other llama.cpp based project

## Prompt format

```
<|system|>
{system_prompt}<|user|>
{prompt}<|assistant|>
```
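Since this chat template ships in the GGUF metadata, llama.cpp can apply it for you in conversation mode. A minimal sketch of running one of the quants below (using the Q4_K_M file name from the table):

```
# Start an interactive chat; -cnv enables conversation mode and
# -p sets the system prompt, formatted per the template above.
llama-cli -m allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf -cnv -p "You are Tulu, a helpful assistant."
```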
## Download a file (not the whole branch) from below:
| Filename | Quant type | File Size | Split | Description |
| -------- | ---------- | --------- | ----- | ----------- |
| [Llama-3.1-Tulu-3.1-8B-Q8_0.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q8_0.gguf) | Q8_0 | 8.54GB | false | Extremely high quality, generally unneeded but max available quant. |
| [Llama-3.1-Tulu-3.1-8B-Q6_K_L.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q6_K_L.gguf) | Q6_K_L | 6.85GB | false | Uses Q8_0 for embed and output weights. Very high quality, near perfect, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q6_K.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q6_K.gguf) | Q6_K | 6.60GB | false | Very high quality, near perfect, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q5_K_L.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_L.gguf) | Q5_K_L | 6.06GB | false | Uses Q8_0 for embed and output weights. High quality, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q5_K_M.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_M.gguf) | Q5_K_M | 5.73GB | false | High quality, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q5_K_S.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_S.gguf) | Q5_K_S | 5.60GB | false | High quality, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q4_K_L.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_L.gguf) | Q4_K_L | 5.31GB | false | Uses Q8_0 for embed and output weights. Good quality, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q4_1.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q4_1.gguf) | Q4_1 | 5.13GB | false | Legacy format, similar performance to Q4_K_S but with improved tokens/watt on Apple silicon. |
| [Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf) | Q4_K_M | 4.92GB | false | Good quality, default size for most use cases, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q3_K_XL.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_XL.gguf) | Q3_K_XL | 4.78GB | false | Uses Q8_0 for embed and output weights. Lower quality but usable, good for low RAM availability. |
| [Llama-3.1-Tulu-3.1-8B-Q4_K_S.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_S.gguf) | Q4_K_S | 4.69GB | false | Slightly lower quality with more space savings, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q4_0.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q4_0.gguf) | Q4_0 | 4.68GB | false | Legacy format, offers online repacking for ARM and AVX CPU inference. |
| [Llama-3.1-Tulu-3.1-8B-IQ4_NL.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ4_NL.gguf) | IQ4_NL | 4.68GB | false | Similar to IQ4_XS, but slightly larger. Offers online repacking for ARM CPU inference. |
| [Llama-3.1-Tulu-3.1-8B-IQ4_XS.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ4_XS.gguf) | IQ4_XS | 4.45GB | false | Decent quality, smaller than Q4_K_S with similar performance, *recommended*. |
| [Llama-3.1-Tulu-3.1-8B-Q3_K_L.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_L.gguf) | Q3_K_L | 4.32GB | false | Lower quality but usable, good for low RAM availability. |
| [Llama-3.1-Tulu-3.1-8B-Q3_K_M.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_M.gguf) | Q3_K_M | 4.02GB | false | Low quality. |
| [Llama-3.1-Tulu-3.1-8B-IQ3_M.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ3_M.gguf) | IQ3_M | 3.78GB | false | Medium-low quality, new method with decent performance comparable to Q3_K_M. |
| [Llama-3.1-Tulu-3.1-8B-Q2_K_L.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q2_K_L.gguf) | Q2_K_L | 3.69GB | false | Uses Q8_0 for embed and output weights. Very low quality but surprisingly usable. |
| [Llama-3.1-Tulu-3.1-8B-Q3_K_S.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_S.gguf) | Q3_K_S | 3.66GB | false | Low quality, not recommended. |
| [Llama-3.1-Tulu-3.1-8B-IQ3_XS.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XS.gguf) | IQ3_XS | 3.52GB | false | Lower quality, new method with decent performance, slightly better than Q3_K_S. |
| [Llama-3.1-Tulu-3.1-8B-IQ3_XXS.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XXS.gguf) | IQ3_XXS | 3.27GB | false | Lower quality, new method with decent performance, comparable to Q3 quants. |
| [Llama-3.1-Tulu-3.1-8B-Q2_K.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-Q2_K.gguf) | Q2_K | 3.18GB | false | Very low quality but surprisingly usable. |
| [Llama-3.1-Tulu-3.1-8B-IQ2_M.gguf](https://huggingface.co/bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF/blob/main/allenai_Llama-3.1-Tulu-3.1-8B-IQ2_M.gguf) | IQ2_M | 2.95GB | false | Relatively low quality, uses SOTA techniques to be surprisingly usable. |
## Embed/output weights
Some of these quants (Q3_K_XL, Q4_K_L, etc.) are the standard quantization method with the embeddings and output weights quantized to Q8_0 instead of what they would normally default to.
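These "_L" files are not a separate quant type; the quantizer is simply told to hold those two tensors at Q8_0. A hedged sketch of how such a file can be produced with llama.cpp's llama-quantize (the input/output file names here are hypothetical):

```
# Quantize to Q4_K_M overall, but keep the token embedding and
# output tensors at Q8_0, as the _L variants above do.
llama-quantize --token-embedding-type q8_0 --output-tensor-type q8_0 \
  Llama-3.1-Tulu-3.1-8B-f16.gguf Llama-3.1-Tulu-3.1-8B-Q4_K_L.gguf Q4_K_M
```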
## Downloading using huggingface-cli
<details>
<summary>Click to view download instructions</summary>
First, make sure you have huggingface-cli installed:
```
pip install -U "huggingface_hub[cli]"
```
Then, you can target the specific file you want:
```
huggingface-cli download bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF --include "allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf" --local-dir ./
```
If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
```
huggingface-cli download bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF --include "allenai_Llama-3.1-Tulu-3.1-8B-Q8_0/*" --local-dir ./
```
You can either specify a new local-dir (allenai_Llama-3.1-Tulu-3.1-8B-Q8_0) or download them all in place (./).
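For files this large, downloads can often be sped up with huggingface_hub's optional hf_transfer backend; a sketch, assuming you want the Q4_K_M file:

```
# Install the optional high-throughput transfer backend,
# then enable it for this download via an environment variable.
pip install -U "huggingface_hub[hf_transfer]"
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download bartowski/allenai_Llama-3.1-Tulu-3.1-8B-GGUF --include "allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf" --local-dir ./
```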
</details>
## ARM/AVX information
Previously, you would download Q4_0_4_4/4_8/8_8, and these would have their weights interleaved in memory in order to improve performance on ARM and AVX machines by loading up more data in one pass.
Now, however, there is something called "online repacking" for weights, detailed in [this PR](https://github.com/ggerganov/llama.cpp/pull/9921). If you use Q4_0 and your hardware would benefit from repacking weights, it will do it automatically on the fly.
As of llama.cpp build [b4282](https://github.com/ggerganov/llama.cpp/releases/tag/b4282) you will not be able to run the Q4_0_X_X files and will instead need to use Q4_0.
Additionally, if you want to get slightly better quality for ARM devices, you can use IQ4_NL thanks to [this PR](https://github.com/ggerganov/llama.cpp/pull/10541) which will also repack the weights for ARM, though only the 4_4 for now. The loading time may be slower but it will result in an overall speed increase.
<details>
<summary>Click to view Q4_0_X_X information (deprecated)</summary>
I'm keeping this section to show the potential theoretical uplift in performance from using the Q4_0 with online repacking.
<details>
<summary>Click to view benchmarks on an AVX2 system (EPYC7702)</summary>
| model | size | params | backend | threads | test | t/s | % (vs Q4_0) |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |-------------: |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp512 | 204.03 ± 1.03 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp1024 | 282.92 ± 0.19 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp2048 | 259.49 ± 0.44 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg128 | 39.12 ± 0.27 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg256 | 39.31 ± 0.69 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg512 | 40.52 ± 0.03 | 100% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp512 | 301.02 ± 1.74 | 147% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp1024 | 287.23 ± 0.20 | 101% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp2048 | 262.77 ± 1.81 | 101% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg128 | 18.80 ± 0.99 | 48% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg256 | 24.46 ± 3.04 | 83% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg512 | 36.32 ± 3.59 | 90% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp512 | 271.71 ± 3.53 | 133% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp1024 | 279.86 ± 45.63 | 100% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp2048 | 320.77 ± 5.00 | 124% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg128 | 43.51 ± 0.05 | 111% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg256 | 43.35 ± 0.09 | 110% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg512 | 42.60 ± 0.31 | 105% |
Q4_0_8_8 offers a nice bump to prompt processing and a small bump to text generation.
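Numbers like these come from llama.cpp's llama-bench tool; a minimal sketch of an equivalent run (model path hypothetical):

```
# Measure prompt processing (pp) and token generation (tg) rates
# at the sizes shown above, on 64 CPU threads.
llama-bench -m ./qwen2-3b-q4_0.gguf -t 64 -p 512,1024,2048 -n 128,256,512
```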
</details>
</details>
## Which file should I choose?
<details>
<summary>Click here for details</summary>
A great write up with charts showing various performances is provided by Artefact2 [here](https://gist.github.com/Artefact2/b5f810600771265fc1e39442288e8ec9)
The first thing to figure out is how big a model you can run. To do this, you'll need to figure out how much RAM and/or VRAM you have.
If you want your model running as FAST as possible, you'll want to fit the whole thing on your GPU's VRAM. Aim for a quant with a file size 1-2GB smaller than your GPU's total VRAM.
If you want the absolute maximum quality, add both your system RAM and your GPU's VRAM together, then similarly grab a quant with a file size 1-2GB smaller than that total.
Next, you'll need to decide if you want to use an 'I-quant' or a 'K-quant'.
If you don't want to think too much, grab one of the K-quants. These are in format 'QX_K_X', like Q5_K_M.
If you want to get more into the weeds, you can check out this extremely useful feature chart:
[llama.cpp feature matrix](https://github.com/ggerganov/llama.cpp/wiki/Feature-matrix)
But basically, if you're aiming for below Q4, and you're running cuBLAS (Nvidia) or rocBLAS (AMD), you should look towards the I-quants. These are in format IQX_X, like IQ3_M. These are newer and offer better performance for their size.
These I-quants can also be used on CPU and Apple Metal, but will be slower than their K-quant equivalent, so speed vs performance is a tradeoff you'll have to decide.
The I-quants are *not* compatible with Vulkan, which AMD cards can also use, so if you have an AMD card double check whether you're using the rocBLAS build or the Vulkan build. At the time of writing this, LM Studio has a preview with ROCm support, and other inference engines have specific builds for ROCm.
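To check those RAM/VRAM numbers quickly on a Linux machine with an Nvidia card, something like the following works (a sketch; tools vary by platform):

```
# Total system RAM
free -h
# Total VRAM per GPU (Nvidia only; requires nvidia-smi)
nvidia-smi --query-gpu=memory.total --format=csv,noheader
```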
</details>
## Credits
Thank you kalomaze and Dampf for assistance in creating the imatrix calibration dataset.
Thank you ZeroWw for the inspiration to experiment with embed/output.
Thank you to LM Studio for sponsoring my work.
Want to support my work? Visit my ko-fi page here: https://ko-fi.com/bartowski

allenai_Llama-3.1-Tulu-3.1-8B-IQ2_M.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6a59a166fd91a5cd22522ed08c5da02384a26c83fb09d1ec03a3faa4bfa63f6e
size 2948319104

allenai_Llama-3.1-Tulu-3.1-8B-IQ3_M.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:336659c4fa2a315dcfaeec2eae021fb9ba6b0d2bba73638ae0a45e98d3c9d42e
size 3784865920

allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XS.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92a837b5af4456511f28fe01dc5a72a8de7abce06c690d81fd9cf91dde9f0660
size 3518789760

allenai_Llama-3.1-Tulu-3.1-8B-IQ3_XXS.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41f6aef9258f3578e6e1749f168316b2bb02887ac664427b75c1aacd3197d86a
size 3274950528

allenai_Llama-3.1-Tulu-3.1-8B-IQ4_NL.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05e9a26a86c0624dc49de09dc97824282881ee0ad0a6e1f04647bba76682f85c
size 4678035840

allenai_Llama-3.1-Tulu-3.1-8B-IQ4_XS.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:612927b4a16ba7b978f7c6af7993cf0a3892fb1a62a32d37a534745caefb0eac
size 4447708544

allenai_Llama-3.1-Tulu-3.1-8B-Q2_K.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ae5906f614c48d206eb4e2859a5c67ba6ad9cd4b080ec3d500d5cb8af528733
size 3179170688

allenai_Llama-3.1-Tulu-3.1-8B-Q2_K_L.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:705b18bf977ab1ea603ab94fdda150044051359621d1a290031adf76122fa12a
size 3692226688

allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_L.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e9ebd08ac2e2099cc799aba1f2706074722cf2f45edc39601b01749646d27bc
size 4321998976

allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_M.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f171c583a1ce16086ec2274ac92709909c901ffa0959c9c2d3891b68f7511c1f
size 4018960512

allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_S.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4ed639005ed505c2fbdcfe15b353d6ff2d0ec74d141494e887f2bb10dcd188d0
size 3664541824

allenai_Llama-3.1-Tulu-3.1-8B-Q3_K_XL.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:378e32e68a8681ecbb75ebc3a17b5ab4439718b888ce0122883eecaafc13b971
size 4781697152

allenai_Llama-3.1-Tulu-3.1-8B-Q4_0.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a17b931e19df2b7ffe5f518c6460d32b58da9ca16930d8de100ace8f9d4721af
size 4675938688

allenai_Llama-3.1-Tulu-3.1-8B-Q4_1.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3c400db20c8e47b44e1763c291f7f3a457ab5a4e0d129eb8bd1d6e7a480512ee
size 5130301824

allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_L.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46517d7ba0cc491c444359198d06518a3f6ff56a4d8de215ca769bc6df4815b9
size 5310703744

allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_M.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5eae0f1a9bcdea7cad9f1d0d5ba7540bb3de3e2d72293c076a23f24db1c2c7da
size 4920781184

allenai_Llama-3.1-Tulu-3.1-8B-Q4_K_S.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:055fa46961d6faf80654689c48b2ade1f4e8d538504f0d7f315a3e10ec87edb6
size 4692715904

allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_L.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0262804550b9579e125a5bc37c31b39868b83c6aa772095196526a0ab57b0ecf
size 6057289856

allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_M.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1dd33cc6a7628ddf5b42d9e4b04024ac8ed0ee6b1116a38e774638d2693d37a1
size 5733038464

allenai_Llama-3.1-Tulu-3.1-8B-Q5_K_S.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83360c67d3d2825fc6472e1f08534b1a1113bdc3650731f75fedc734317b78cc
size 5599345024

allenai_Llama-3.1-Tulu-3.1-8B-Q6_K.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4482dc6f92685218ee7c5e2c84fa25bc78c13528612f3101d7d5829d1e94e78f
size 6596061824

allenai_Llama-3.1-Tulu-3.1-8B-Q6_K_L.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba24544fbb94acc0768e506a6c67de68edc01e2a73f1a39474810445c0b23b6a
size 6850537600

allenai_Llama-3.1-Tulu-3.1-8B-Q8_0.gguf Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7826c28d88b309e83acb4eb08fc105f2e761dd16c12b05f048ba6f45d16bfd69
size 8540842112

allenai_Llama-3.1-Tulu-3.1-8B.imatrix Normal file

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46f008a419bcde3ebca846e3604c7b159897b63d8e2c41e743e6d30663651b5d
size 4988170

configuration.json Normal file

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}