commit 0540ccdebd2b75f05b70536d171497d107827ad3 Author: ModelHub XC Date: Fri Jun 5 18:10:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: prism-ml/Ternary-Bonsai-4B-gguf Source: Original Platform diff --git a/.eval_results/gsm8k.yaml b/.eval_results/gsm8k.yaml new file mode 100644 index 0000000..25bdcf4 --- /dev/null +++ b/.eval_results/gsm8k.yaml @@ -0,0 +1,7 @@ +- dataset: + id: openai/gsm8k + task_id: gsm8k + value: 90.5 + source: + url: https://huggingface.co/prism-ml/Ternary-Bonsai-4B-gguf + name: Model Card \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dd0dc75 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,40 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs 
merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +Ternary-Bonsai-4B-F16.gguf filter=lfs diff=lfs merge=lfs -text +Ternary-Bonsai-4B-Q2_0.gguf filter=lfs diff=lfs merge=lfs -text +Ternary-Bonsai-4B-TQ1_0.gguf filter=lfs diff=lfs merge=lfs -text +Ternary-Bonsai-4B-TQ2_0.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..66a27ec --- /dev/null +++ b/LICENSE @@ -0,0 +1,177 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 0000000..2c102a7 --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,4 @@ +This software is copyright 2026-present Prism ML, Inc. It is available under the Apache 2.0 license. +If you publicly deploy or redistribute this software, we would appreciate attribution such as: “Created using Bonsai by Prism ML.” 
+ +This software is built from Qwen3-4B, Copyright 2024 Alibaba Cloud, which is available under the Apache 2.0 License: https://huggingface.co/Qwen/Qwen3-4B/blob/main/LICENSE \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..863b976 --- /dev/null +++ b/README.md @@ -0,0 +1,165 @@ +--- +license: apache-2.0 +library_name: gguf +pipeline_tag: text-generation +tags: +- ternary +- 1.58-bit +- gguf +- llama-cpp +- q2_0 +- on-device +- prismml +- bonsai +base_model: +- prism-ml/Ternary-Bonsai-4B-unpacked +--- + +

+ Bonsai +

+ +

+ Prism ML Website  |  + White Paper  |  + Demo & Examples  |  + Discord +

+ +# Ternary-Bonsai-4B-gguf + +Ternary (1.58-bit) language model in GGUF Q2_0 format for `llama.cpp` + +

+ Pareto Frontier +

+ +## Resources + +- **[White Paper](https://github.com/PrismML-Eng/Bonsai-demo/blob/main/ternary-bonsai-8b-whitepaper.pdf)** +- **[Demo repo](https://github.com/PrismML-Eng/Bonsai-demo)** — examples for serving, benchmarking, and integrating Bonsai +- **[Discord](https://discord.gg/prismml)** — community support and updates +- **Kernels**: Q2_0 is not yet in mainline `llama.cpp`. Use our fork at [PrismML-Eng/llama.cpp](https://github.com/PrismML-Eng/llama.cpp) (`prism` branch, default) which adds Q2_0 support for CPU (NEON/generic) and Metal. Upstream PR coming soon. + +## Model Overview + +| Item | Specification | +| :--------------- | :----------------------------------------------------------------------- | +| Base model | Qwen3-4B | +| Parameters | 4.0B (~3.6B non-embedding) | +| Architecture | GQA (32 query / 8 KV heads), SwiGLU MLP, RoPE, RMSNorm | +| Layers | 36 Transformer decoder blocks | +| Context length | 32,768 tokens | +| Vocab size | 151,936 | +| Weight format | GGUF Q2_0 g128: {-1, 0, +1} with FP16 group-wise scaling | +| Packed Q2_0 size | **1,020 MiB** (1.07 GB) | +| Ternary coverage | Embeddings, attention projections, MLP projections, LM head | +| License | Apache 2.0 | + +## Quantization Format: GGUF Q2_0 (g128) + +Each weight takes a value from {-1, 0, +1}, with one shared FP16 scale per group of 128 weights: + +``` +w_i = scale_g * t_i, t_i in {-1, 0, +1} +``` + +Q2_0 encodes each weight as a 2-bit code `q in {0, 1, 2, 3}`, dequantized via `w = (q - 1) * scale`. One 128-element block is 34 bytes (2 bytes FP16 scale + 32 bytes of packed 2-bit codes) for an effective **2.125 bits/weight**. The fourth code point (`q = 3`, reconstructing to `+2 * scale`) is reserved for future extensions; for ternary weights it is unused. 
+ +### Memory + +| Format | Size | Reduction | Ratio | +| :---------------- | ----------: | --------: | ---------: | +| FP16 | 8.04 GB | -- | 1.0x | +| **GGUF Q2_0 g128**| **1,020 MiB** (1.07 GB) | **86.3%** | **7.3x** | + +## Files in this repo + +| File | Format | Size | Recommended | +| :------------------------------ | :----- | -----: | :---------- | +| `Ternary-Bonsai-4B-F16.gguf` | FP16 | 8.04 GB | baseline / re-quantization source | +| `Ternary-Bonsai-4B-Q2_0.gguf` | Q2_0 (g128) | 1,020 MiB | **recommended** (lossless for ternary) | + +## Quickstart + +### Build from the Prism fork + +```bash +git clone https://github.com/PrismML-Eng/llama.cpp +cd llama.cpp +cmake -B build -DGGML_METAL=ON # or -DGGML_CUDA=ON, -DGGML_VULKAN=ON +cmake --build build -j +``` + +### `llama.cpp` CLI + +```bash +./build/bin/llama-cli \ + -m Ternary-Bonsai-4B-Q2_0.gguf \ + -p "Explain quantum computing in simple terms." \ + -n 256 +``` + +### `llama.cpp` server + +```bash +./build/bin/llama-server -m Ternary-Bonsai-4B-Q2_0.gguf -c 4096 +``` + + +## Throughput (llama.cpp, Apple M4 Pro 48 GB) + +| Backend | PP512 (tok/s) | TG128 (tok/s) | +| :--------------- | ------------: | ------------: | +| Metal (GPU) | 826 | **120** | +| NEON CPU (10 t) | 226 | **56** | + +Flags: `-ngl 99 -fa 1` for Metal; `-ngl 0 -fa 1 -t 10` for CPU. + +## Fidelity (Q2_0 vs FP16 baseline) + +Q2_0 is effectively lossless for ternary weights — the ternary values land exactly on three of the four 2-bit code points, so quantize/dequantize is bit-exact in the absence of FP16 scale rounding. + +## Benchmarks + +Evaluated with EvalScope v1.4.2 + vLLM 0.15.1 on NVIDIA H100. 
Full benchmark suite: + +| Model | Size | Avg | MMLU-R | MuSR | IFEval | GSM8K | HE+ | BFCLv3 | +| :------------------------ | ----------: | --------: | -----: | ---: | -----: | ----: | ---: | -----: | +| **Ternary Bonsai 4B** | **1.02 GB** | **70.7** | 69.7 | 45.1 | 72.1 | 90.5 | 78.7 | 67.8 | +| *1-bit Bonsai 4B (prior)* | *0.57 GB* | *62.7* | 58.7 | 41.4 | 69.6 | 87.3 | 71.3 | 48.0 | +| Qwen 3 4B | 8.04 GB | **77.1** | 79.8 | 57.4 | 80.0 | 92.1 | 74.4 | 78.9 | +| Ministral3 3B | 6.86 GB | **73.2** | 77.5 | 56.5 | 73.1 | 91.4 | 69.5 | 71.3 | +| Gemma 3 4B | 7.76 GB | **67.9** | 66.0 | 46.3 | 73.0 | 89.8 | 67.1 | 65.1 | +| Llama 3.2 3B | 6.43 GB | **64.4** | 65.5 | 48.9 | 78.3 | 80.1 | 52.4 | 60.9 | + +## Intelligence Density + +``` +density = -ln(1 - score/100) / size_GB +``` + +| Model | Size | Intelligence Density (1/GB) | +| :------------------------ | ----------: | --------------------------: | +| **Ternary Bonsai 4B** | **1.02 GB** | **1.202** | +| *1-bit Bonsai 4B (prior)* | *0.57 GB* | *1.744* | +| Ministral3 3B | 6.86 GB | 0.192 | +| Qwen 3 4B | 8.04 GB | 0.183 | +| Llama 3.2 3B | 6.43 GB | 0.161 | +| Gemma 3 4B | 7.76 GB | 0.146 | + +## Citation + +```bibtex +@techreport{ternarybonsai, + title = {Ternary Bonsai: 1.58-bit Language Models at 8B, 4B, and 1.7B Scale}, + author = {Prism ML}, + year = {2026}, + month = {April}, + url = {https://prismml.com} +} +``` + +## Contact + +For questions, feedback, or collaboration inquiries: **contact@prismml.com** \ No newline at end of file diff --git a/Ternary-Bonsai-4B-F16.gguf b/Ternary-Bonsai-4B-F16.gguf new file mode 100644 index 0000000..a2c1ac9 --- /dev/null +++ b/Ternary-Bonsai-4B-F16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bb7f8277a715eeb7ab306fd27d9d4e9abb078c92717856c3d3415777362f5c +size 8049911840 diff --git a/Ternary-Bonsai-4B-Q2_0.gguf b/Ternary-Bonsai-4B-Q2_0.gguf new file mode 100644 index 0000000..0273932 --- /dev/null +++ b/Ternary-Bonsai-4B-Q2_0.gguf @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0bf8b737b0431552f8c2c97695ab7c0cb214c94bcdeb4f5f267e67ddf28b8b +size 1074969344 diff --git a/assets/bonsai-logo.svg b/assets/bonsai-logo.svg new file mode 100644 index 0000000..6e2df3a --- /dev/null +++ b/assets/bonsai-logo.svg @@ -0,0 +1 @@ + diff --git a/assets/frontier.png b/assets/frontier.png new file mode 100644 index 0000000..6035ebf Binary files /dev/null and b/assets/frontier.png differ diff --git a/assets/frontier.svg b/assets/frontier.svg new file mode 100644 index 0000000..034dfe9 --- /dev/null +++ b/assets/frontier.svg @@ -0,0 +1,172 @@ + + + + + + + + + + + + + + + + + + Frontier efficiency + Average score (IFEval, GSM8K, HumanEval+, BFCL, MuSR, MMLU-Redux) + + + + + + + + + + + + + + + + + + + + + 40 + 50 + 60 + 70 + 80 + + + + + + + + + + 0.25 GB + 0.5 GB + 1 GB + 2 GB + 4 GB + 8 GB + 16 GB + + + + + + + + + + + + Model size in GB (log scale) + Average benchmark score + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1-Bit Bonsai 1.7B + 1-Bit Bonsai 4B + 1-Bit Bonsai 8B + + + + + + + + Ternary Bonsai 1.7B + Ternary Bonsai 4B + Ternary Bonsai 8B + + + Qwen3 0.6B + Qwen3 1.7B + Ministral3 3B + Qwen3 4B + Qwen3 8B +