Update metadata with huggingface_hub

ai-modelscope
2025-02-18 06:47:20 +08:00
parent 31c74d6d1a
commit 796631f1a7
29 changed files with 290 additions and 63 deletions

.gitattributes (vendored)

@@ -1,47 +1,61 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-f16.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q6_K_L.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q5_K_L.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q4_K_L.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q4_1.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q3_K_XL.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q2_K_L.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct-f32.gguf filter=lfs diff=lfs merge=lfs -text
ilsp_Llama-Krikri-8B-Instruct.imatrix filter=lfs diff=lfs merge=lfs -text

README.md

@@ -1,47 +1,181 @@
---
quantized_by: bartowski
pipeline_tag: text-generation
license: llama3.1
language:
- el
- en
tags:
- text-generation-inference
base_model: ilsp/Llama-Krikri-8B-Instruct
---

## Llamacpp imatrix Quantizations of Llama-Krikri-8B-Instruct by ilsp

Using <a href="https://github.com/ggerganov/llama.cpp/">llama.cpp</a> release <a href="https://github.com/ggerganov/llama.cpp/releases/tag/b4671">b4671</a> for quantization.

Original model: https://huggingface.co/ilsp/Llama-Krikri-8B-Instruct

All quants made using imatrix option with dataset from [here](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8)

Run them in [LM Studio](https://lmstudio.ai/)

Run them directly with [llama.cpp](https://github.com/ggerganov/llama.cpp), or any other llama.cpp based project

## Prompt format

```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
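As a quick illustration, a prompt for this template can be assembled like so (a minimal sketch; the system and user strings below are placeholders, not values from the card):

```python
# Minimal sketch: fill in the chat template shown above.
# system_prompt and prompt are illustrative placeholders.
system_prompt = "You are a helpful assistant."
prompt = "Ποια είναι η πρωτεύουσα της Ελλάδας;"

formatted = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "Cutting Knowledge Date: December 2023\n"
    "Today Date: 26 Jul 2024\n\n"
    f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    f"{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)
print(formatted)
```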
## Download a file (not the whole branch) from below:
| Filename | Quant type | File Size | Split | Description |
| -------- | ---------- | --------- | ----- | ----------- |
| [Llama-Krikri-8B-Instruct-f32.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-f32.gguf) | f32 | 32.82GB | false | Full F32 weights. |
| [Llama-Krikri-8B-Instruct-f16.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-f16.gguf) | f16 | 16.41GB | false | Full F16 weights. |
| [Llama-Krikri-8B-Instruct-Q8_0.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q8_0.gguf) | Q8_0 | 8.72GB | false | Extremely high quality, generally unneeded but max available quant. |
| [Llama-Krikri-8B-Instruct-Q6_K_L.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q6_K_L.gguf) | Q6_K_L | 7.03GB | false | Uses Q8_0 for embed and output weights. Very high quality, near perfect, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q6_K.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q6_K.gguf) | Q6_K | 6.74GB | false | Very high quality, near perfect, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q5_K_L.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q5_K_L.gguf) | Q5_K_L | 6.24GB | false | Uses Q8_0 for embed and output weights. High quality, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q5_K_M.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q5_K_M.gguf) | Q5_K_M | 5.86GB | false | High quality, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q5_K_S.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q5_K_S.gguf) | Q5_K_S | 5.73GB | false | High quality, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q4_K_L.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q4_K_L.gguf) | Q4_K_L | 5.49GB | false | Uses Q8_0 for embed and output weights. Good quality, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q4_1.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q4_1.gguf) | Q4_1 | 5.26GB | false | Legacy format, similar performance to Q4_K_S but with improved tokens/watt on Apple silicon. |
| [Llama-Krikri-8B-Instruct-Q4_K_M.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q4_K_M.gguf) | Q4_K_M | 5.04GB | false | Good quality, default size for most use cases, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q3_K_XL.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q3_K_XL.gguf) | Q3_K_XL | 4.97GB | false | Uses Q8_0 for embed and output weights. Lower quality but usable, good for low RAM availability. |
| [Llama-Krikri-8B-Instruct-Q4_K_S.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q4_K_S.gguf) | Q4_K_S | 4.81GB | false | Slightly lower quality with more space savings, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q4_0.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q4_0.gguf) | Q4_0 | 4.80GB | false | Legacy format, offers online repacking for ARM and AVX CPU inference. |
| [Llama-Krikri-8B-Instruct-IQ4_NL.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ4_NL.gguf) | IQ4_NL | 4.80GB | false | Similar to IQ4_XS, but slightly larger. Offers online repacking for ARM CPU inference. |
| [Llama-Krikri-8B-Instruct-IQ4_XS.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ4_XS.gguf) | IQ4_XS | 4.57GB | false | Decent quality, smaller than Q4_K_S with similar performance, *recommended*. |
| [Llama-Krikri-8B-Instruct-Q3_K_L.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q3_K_L.gguf) | Q3_K_L | 4.43GB | false | Lower quality but usable, good for low RAM availability. |
| [Llama-Krikri-8B-Instruct-Q3_K_M.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q3_K_M.gguf) | Q3_K_M | 4.13GB | false | Low quality. |
| [Llama-Krikri-8B-Instruct-IQ3_M.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ3_M.gguf) | IQ3_M | 3.89GB | false | Medium-low quality, new method with decent performance comparable to Q3_K_M. |
| [Llama-Krikri-8B-Instruct-Q2_K_L.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q2_K_L.gguf) | Q2_K_L | 3.88GB | false | Uses Q8_0 for embed and output weights. Very low quality but surprisingly usable. |
| [Llama-Krikri-8B-Instruct-Q3_K_S.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q3_K_S.gguf) | Q3_K_S | 3.77GB | false | Low quality, not recommended. |
| [Llama-Krikri-8B-Instruct-IQ3_XS.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ3_XS.gguf) | IQ3_XS | 3.63GB | false | Lower quality, new method with decent performance, slightly better than Q3_K_S. |
| [Llama-Krikri-8B-Instruct-IQ3_XXS.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ3_XXS.gguf) | IQ3_XXS | 3.37GB | false | Lower quality, new method with decent performance, comparable to Q3 quants. |
| [Llama-Krikri-8B-Instruct-Q2_K.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-Q2_K.gguf) | Q2_K | 3.28GB | false | Very low quality but surprisingly usable. |
| [Llama-Krikri-8B-Instruct-IQ2_M.gguf](https://huggingface.co/bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF/blob/main/ilsp_Llama-Krikri-8B-Instruct-IQ2_M.gguf) | IQ2_M | 3.05GB | false | Relatively low quality, uses SOTA techniques to be surprisingly usable. |
## Embed/output weights
Some of these quants (Q3_K_XL, Q4_K_L, etc.) use the standard quantization method, but with the embedding and output weights quantized to Q8_0 instead of their usual defaults.
## Downloading using huggingface-cli
<details>
<summary>Click to view download instructions</summary>
First, make sure you have huggingface-cli installed:
```
pip install -U "huggingface_hub[cli]"
```
Then, you can target the specific file you want:
```
huggingface-cli download bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF --include "ilsp_Llama-Krikri-8B-Instruct-Q4_K_M.gguf" --local-dir ./
```
If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
```
huggingface-cli download bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF --include "ilsp_Llama-Krikri-8B-Instruct-Q8_0/*" --local-dir ./
```
You can either specify a new local-dir (ilsp_Llama-Krikri-8B-Instruct-Q8_0) or download them all in place (./).
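Alternatively, the same single-file download can be done from Python with the `huggingface_hub` API (a minimal sketch; the filename is the Q4_K_M quant from the table above):

```python
# Sketch: download one quant file via the huggingface_hub Python API.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="bartowski/ilsp_Llama-Krikri-8B-Instruct-GGUF",
    filename="ilsp_Llama-Krikri-8B-Instruct-Q4_K_M.gguf",
    local_dir="./",  # drop this to use the default HF cache instead
)
print(path)  # local path of the downloaded GGUF
```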
</details>
## ARM/AVX information
Previously, you would download Q4_0_4_4/4_8/8_8, and these would have their weights interleaved in memory in order to improve performance on ARM and AVX machines by loading up more data in one pass.
Now, however, there is something called "online repacking" for weights; details are in [this PR](https://github.com/ggerganov/llama.cpp/pull/9921). If you use Q4_0 and your hardware would benefit from repacking weights, it will do it automatically on the fly.
As of llama.cpp build [b4282](https://github.com/ggerganov/llama.cpp/releases/tag/b4282) you will not be able to run the Q4_0_X_X files and will instead need to use Q4_0.
Additionally, if you want to get slightly better quality for ARM, you can use IQ4_NL thanks to [this PR](https://github.com/ggerganov/llama.cpp/pull/10541), which will also repack the weights for ARM, though only the 4_4 for now. The loading time may be slower but it will result in an overall speed increase.
<details>
<summary>Click to view Q4_0_X_X information (deprecated)</summary>
I'm keeping this section to show the theoretical performance uplift from using Q4_0 with online repacking.
<details>
<summary>Click to view benchmarks on an AVX2 system (EPYC7702)</summary>
| model | size | params | backend | threads | test | t/s | % (vs Q4_0) |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |-------------: |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp512 | 204.03 ± 1.03 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp1024 | 282.92 ± 0.19 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | pp2048 | 259.49 ± 0.44 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg128 | 39.12 ± 0.27 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg256 | 39.31 ± 0.69 | 100% |
| qwen2 3B Q4_0 | 1.70 GiB | 3.09 B | CPU | 64 | tg512 | 40.52 ± 0.03 | 100% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp512 | 301.02 ± 1.74 | 147% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp1024 | 287.23 ± 0.20 | 101% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | pp2048 | 262.77 ± 1.81 | 101% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg128 | 18.80 ± 0.99 | 48% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg256 | 24.46 ± 3.04 | 83% |
| qwen2 3B Q4_K_M | 1.79 GiB | 3.09 B | CPU | 64 | tg512 | 36.32 ± 3.59 | 90% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp512 | 271.71 ± 3.53 | 133% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp1024 | 279.86 ± 45.63 | 100% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | pp2048 | 320.77 ± 5.00 | 124% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg128 | 43.51 ± 0.05 | 111% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg256 | 43.35 ± 0.09 | 110% |
| qwen2 3B Q4_0_8_8 | 1.69 GiB | 3.09 B | CPU | 64 | tg512 | 42.60 ± 0.31 | 105% |
Q4_0_8_8 offers a nice bump to prompt processing and a small bump to text generation.
</details>
</details>
## Which file should I choose?
<details>
<summary>Click here for details</summary>
A great write-up with charts comparing the performance of the various quants is provided by Artefact2 [here](https://gist.github.com/Artefact2/b5f810600771265fc1e39442288e8ec9)
The first thing to figure out is how big a model you can run. To do this, you'll need to figure out how much RAM and/or VRAM you have.
If you want your model running as FAST as possible, you'll want to fit the whole thing in your GPU's VRAM. Aim for a quant with a file size 1-2GB smaller than your GPU's total VRAM.
If you want the absolute maximum quality, add your system RAM and your GPU's VRAM together, then grab a quant with a file size 1-2GB smaller than that total.
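As a rough sketch of that sizing rule (the file sizes come from the table above; the ~1.5GB margin and the `pick_quant` helper are illustrative assumptions, not part of the card):

```python
# Sketch of the rule of thumb above: pick the largest quant whose file
# fits within your memory budget minus a ~1.5 GB safety margin.
quant_sizes_gb = {  # a few file sizes from the table above
    "Q8_0": 8.72, "Q6_K": 6.74, "Q5_K_M": 5.86,
    "Q4_K_M": 5.04, "IQ4_XS": 4.57, "Q3_K_M": 4.13, "IQ2_M": 3.05,
}

def pick_quant(memory_gb: float, margin_gb: float = 1.5) -> str:
    budget = memory_gb - margin_gb
    fitting = {q: s for q, s in quant_sizes_gb.items() if s <= budget}
    return max(fitting, key=fitting.get) if fitting else "nothing fits"

print(pick_quant(8.0))   # e.g. an 8 GB GPU -> Q5_K_M
print(pick_quant(24.0))  # plenty of room -> Q8_0
```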
Next, you'll need to decide if you want to use an 'I-quant' or a 'K-quant'.
If you don't want to think too much, grab one of the K-quants. These are in format 'QX_K_X', like Q5_K_M.
If you want to get more into the weeds, you can check out this extremely useful feature chart:
[llama.cpp feature matrix](https://github.com/ggerganov/llama.cpp/wiki/Feature-matrix)
But basically, if you're aiming for below Q4, and you're running cuBLAS (Nvidia) or rocBLAS (AMD), you should look towards the I-quants. These are in format IQX_X, like IQ3_M. These are newer and offer better performance for their size.
These I-quants can also be used on CPU and Apple Metal, but will be slower than their K-quant equivalents, so the speed vs. quality tradeoff is something you'll have to weigh.
The I-quants are *not* compatible with Vulkan, which is also an option on AMD, so if you have an AMD card double check whether you're using the rocBLAS build or the Vulkan build. At the time of writing this, LM Studio has a preview with ROCm support, and other inference engines have specific builds for ROCm.
</details>
## Credits
Thank you kalomaze and Dampf for assistance in creating the imatrix calibration dataset.
Thank you ZeroWw for the inspiration to experiment with embed/output.
Thank you to LM Studio for sponsoring my work.
Want to support my work? Visit my ko-fi page here: https://ko-fi.com/bartowski

configuration.json (new file)

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e61609ec7421b96ad0f12207baea322c262c165ecf4ff1d9beb9fd9af02ceb2d
size 3045854912


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:30ba6976855a719cec5917b1fac953fb49409d1446fe12be61b1a2b7359570c1
size 3893817024


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e22cefd20d9092637ce1b6b92545ff735be30fb829ab54ba3f37bcbbedf5c31
size 3627740864


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0705c6b757785f4e2501b0a777a516adfcca64d86a6839d2d3dfd87ff608c0cf
size 3372486336


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e552b70dca8dc0c5bba9c9700b4229a818498a0e0d60548e19e3bb6743cce00
size 4798402240


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b39dfb3421aca52a6c19fdca6a6d014111405a4dbaeb926a3b45d9d37d3c08b0
size 4565388992


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fd59d35c99ea8e68856599a11e97c20a06e599c25c340c78e206161728298531
size 3279392448


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e014e110d0ac6066ddb5aabf908b8d2e76961baef8823c4627d8211f06eb1bb
size 3876384448


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:31c12f386f8c5a9dc472c7d7c6af7d11155b9149144eb40022daa34c394a07bd
size 4430950080


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b15b2fecd53509a26a6a5a6919191be5ac913ac381accbb407cbab1db9ea403
size 4127911616


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab54eb5076536b109f48c3c807d14faed89b5ff5cb87c7688641b82fe5f3c2e3
size 3773492928


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44f08d1250c99352ae88a84c881594a18ce1a08b14fc4c6eb96616f08b9fcefe
size 4965854912


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f0377c2f01ac304967749d251cbb7c12f2ba70d1674eee29e52866a27e4edd5c
size 4796305088


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:141e3c704187e5913fb04b3516ba68a5d9d053ed9372335cec46632a1c199d9a
size 5256040128


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:514e7bee1ad8948a8fce3801b5e258f61bfafb7abdcd69b2be3537e90c018d87
size 5494861504


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ae3a259f03ed79ba634a99ee3bfc672d785b5594b2f71053ed8cb760098abb6
size 5041147584


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ff9f957a6f02785d422f783487accc832dc48e9bfff4931ecd32a95bbf9ac0a
size 4813082304


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d724cf4bd778e4cc5c93dcf71e78f6d8f647e3b141d3758e582f33cd0788a62
size 6241447616


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f673986edcda1bbd5870e72136103ff94da3d4dcd386bb51b6d2567901c2e9b
size 5864148672


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cd5deb4f5f2fc6d97475fbc3bec2edd12eed160a30d4636b8f64477aa0689ca1
size 5730455232


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fc6ba6109d5cf05308b432ec3fd733c7ab030c8046464460b8941a234f1a539
size 6738587328


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab191cc3a9c5e8138565e31cedc8942f2cb138aeafeddca0cc4ce20d4f903d7b
size 7034695360


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a491c4bea2c202151420f2bffc835d52c81762004ecdff6a301e1f69cc50d191
size 8724999872


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97246cb99b80faac27f820cf6525175a901274a43de50da333c281b4d292294a
size 16414338752


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ad325bf6946b76c5008f5c60dad2b009c66343127365845bc55c7a7095dba01
size 32818261376


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:093d27e00a456fc4073a202bfebbd555f5ac434b87308ad0eb38204bdb3edeb5
size 4988170