commit 615a91016f93c028b23c6efaa11d41e1c011f90e Author: ModelHub XC Date: Sat Jun 13 10:42:15 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file 
mode 100644 index 0000000..342d307 --- /dev/null +++ b/README.md @@ -0,0 +1,247 @@ +--- +library_name: transformers +pipeline_tag: text-generation +license: apache-2.0 +language: + - en +base_model: Qwen/Qwen3-0.6B +datasets: + - 0xZee/dataset-CoT-Advanced-Calculus-268 + - 0xZee/dataset-CoT-Modern-Physics-177 + - 0xZee/dataset-CoT-Theoretical-Mechanics-307 + - 0xZee/dataset-CoT-Linear-Algebra-667 + - 0xZee/dataset-CoT-Electromagnetism-580 + - 0xZee/dataset-CoT-Molecular-Biology-71 + - 0xZee/dataset-CoT-Physiology-114 + - 0xZee/dataset-CoT-Classical-Mechanics-343 + - 0xZee/dataset-CoT-Differential-Equations-636 + - 0xZee/dataset-CoT-Physics-2254 + - 0xZee/dataset-CoT-Engineering-574 + - 0xZee/dataset-CoT-mathematics +tags: + - causal-lm + - text-generation + - distillation + - knowledge-distillation + - reasoning + - chain-of-thought + - mathematics + - physics + - engineering + - stem + - convergentintel + - edge +--- + +# Qwen3-0.6B STEM Proof Distilled (Thinking Teacher) + +A 0.6B parameter model distilled from Qwen3-30B-A3B-**Thinking** on 6,122 STEM chain-of-thought samples. 50x parameter compression. The Thinking variant teacher produces richer extended reasoning traces than the Instruct variant, transferring deeper deliberation structure into the smallest possible student. + +The result: a model under 500MB quantized that produces structured STEM derivations because a 30B thinking model showed it how to reason. + +> *"Structure beats scale."* +> — Convergent Intelligence LLC: Research Division + +## What Makes This Different + +Two key differences from standard small-model distillation: + +**1. Thinking teacher, not Instruct teacher.** The Qwen3-30B-A3B-Thinking variant generates extended internal reasoning before committing to an answer. Its softmax distributions are higher-entropy — it considers more reasoning paths at each step. 
At distillation temperature T=2.0, this means the 0.6B student sees a much richer landscape of alternative derivation strategies than it would from an Instruct teacher. The student doesn't just learn the answer — it learns the deliberation. + +**2. Proof-weighted loss.** Tokens inside the derivation region (`Proof:` to `Final Answer:`) receive 2.5x amplified loss, decaying to 1.5x over training. The model is penalized more for errors in reasoning steps than for errors in answer formatting. At 0.6B, every parameter has to count — proof weighting ensures they're allocated to reasoning capability, not boilerplate reproduction. + +## Model Details + +| Attribute | Value | +|---|---| +| **Architecture** | Qwen3 (causal LM, RoPE, GQA) | +| **Parameters** | 0.6B | +| **Base model** | [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) | +| **Teacher model** | [Qwen/Qwen3-30B-A3B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507) | +| **Compression ratio** | 50x (30B → 0.6B) | +| **Context length** | 1024 tokens (training) | +| **Precision** | bf16 | +| **License** | Apache 2.0 | +| **Developer** | Reaperdoesntrun / [Convergent Intelligence LLC](https://convergentintel.com): Research Division | + +## Training + +### Loss Function + +1. **Proof-Weighted Cross-Entropy (55%)** — Amplified weight on derivation tokens (2.5x → 1.5x linear decay) +2. 
**Knowledge Distillation KL Divergence (45%)** — Student/teacher softmax divergence at T=2.0, scaled by T² + +Combined: `L = 0.55 * CE_weighted + 0.45 * KD_kl` + +### Hyperparameters + +| Parameter | Value | +|---|---| +| Epochs | 1 | +| Training samples | 5,815 (95% of 6,122) | +| Eval samples | 307 (5% held out) | +| Effective batch size | 8 | +| Optimizer | AdamW (weight decay 0.01) | +| Learning rate | 1.5e-5 → 1e-6 (cosine, 30-step warmup) | +| Gradient clipping | 1.0 | +| Temperature | 2.0 | +| Proof weight | 2.5 → 1.5 | +| Precision | bf16 | + +### Dataset + +6,122 STEM CoT samples from 12 domains (Physics 2,254 / Linear Algebra 667 / Differential Equations 636 / Electromagnetism 580 / Mathematics 576 / Engineering 574 / Classical Mechanics 343 / Theoretical Mechanics 307 / Advanced Calculus 268 / Modern Physics 177 / Physiology 114 / Molecular Biology 71). All from [0xZee](https://huggingface.co/0xZee). + +### Training Format + +``` +Solve the following problem carefully and show a rigorous derivation. + +Problem: +{question} + +Proof: +{CoT} + +Final Answer: +{response} +``` + +## Usage + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +model_id = "reaperdoesntknow/Qwen3-0.6B-STEM-Proof-Distilled-Thinking" + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, + device_map="auto", +) + +prompt = """Solve the following problem carefully and show a rigorous derivation. + +Problem: +Find the eigenvalues of the matrix [[3, 1], [0, 3]]. 
+ +Proof: +""" + +inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +with torch.no_grad(): + outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +## Intended Uses + +**Good for:** Lightweight STEM reasoning on edge/mobile devices, educational tutoring, proof drafting, component in multi-model pipelines where a small fast reasoner is needed, IoT and embedded inference. + +**Not for:** Formal proof verification, safety-critical analysis, medical or legal advice, or tasks requiring long-context reasoning beyond 1024 tokens. + +## Limitations + +0.6B is a hard capacity constraint. The model will struggle with multi-step proofs requiring more than ~8 reasoning steps, complex multi-variable problems, or domains underrepresented in training data (molecular biology, physiology). It will sometimes generate plausible but incorrect intermediate steps. Always verify. + + +## Mathematical Foundations: Discrepancy Calculus (DISC) + +This model is part of a distillation chain built on Discrepancy Calculus — a measure-theoretic framework where the teacher's output distribution is decomposed via the Mesh Fundamental Identity into smooth (AC), jump, and Cantor components. The discrepancy operator $Df(x) = \lim_{\varepsilon \downarrow 0} \frac{1}{\varepsilon} \int_x^{x+\varepsilon} \frac{|f(t) - f(x)|}{|t - x|} dt$ quantifies local structural mismatch that standard KL divergence averages away. + +Full theory: *"On the Formal Analysis of Discrepancy Calculus"* (Colca, 2026; Convergent Intelligence LLC: Research Division). Full methodology: [Structure Over Scale (DOI: 10.57967/hf/8165)](https://doi.org/10.57967/hf/8165). 
+ +## Related Models + +| Model | Description | +|---|---| +| [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT) | This model + legal SFT | +| [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF) | Quantized for edge deployment | +| [Qwen3-1.7B-STEM-Proof-Distilled](https://huggingface.co/reaperdoesntknow/Qwen3-1.7B-STEM-Proof-Distilled) | Larger 1.7B variant (Instruct teacher) | + +## Citation + +```bibtex +@misc{colca2026distilled06b, + title={Qwen3-0.6B STEM Proof Distilled: 50x Compression from a Thinking Teacher}, + year={2026}, + publisher={HuggingFace}, + url={https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-STEM-Proof-Distilled-Thinking}, + note={Convergent Intelligence LLC: Research Division} +} +``` + +--- + +*Convergent Intelligence LLC: Research Division* +*"Where classical analysis fails to see, we begin."* + +--- + +## Convergent Intelligence Portfolio + +*Part of the [Qwen3 0.6B Distillation Series](https://huggingface.co/reaperdoesntknow) by [Convergent Intelligence LLC: Research Division](https://huggingface.co/reaperdoesntknow)* + + +# +## Mathematical Foundations: Discrepancy Calculus (DISC) + +This model is part of a distillation chain built on Discrepancy Calculus — a measure-theoretic framework where the teacher's output distribution is decomposed via the Mesh Fundamental Identity into smooth (AC), jump, and Cantor components. The discrepancy operator $Df(x) = \lim_{\varepsilon \downarrow 0} \frac{1}{\varepsilon} \int_x^{x+\varepsilon} \frac{|f(t) - f(x)|}{|t - x|} dt$ quantifies local structural mismatch that standard KL divergence averages away. + +Full theory: *"On the Formal Analysis of Discrepancy Calculus"* (Colca, 2026; Convergent Intelligence LLC: Research Division). Full methodology: [Structure Over Scale (DOI: 10.57967/hf/8165)](https://doi.org/10.57967/hf/8165). 
+ +## Related Models + +| Model | Downloads | Format | +|-------|-----------|--------| +| [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT) | 33 | HF | +| [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF) | 203 | GGUF | + +### Top Models from Our Lab + +| Model | Downloads | +|-------|-----------| +| [Qwen3-1.7B-Thinking-Distil](https://huggingface.co/reaperdoesntknow/Qwen3-1.7B-Thinking-Distil) | 501 | +| [LFM2.5-1.2B-Distilled-SFT](https://huggingface.co/reaperdoesntknow/LFM2.5-1.2B-Distilled-SFT) | 342 | +| [Qwen3-1.7B-Coder-Distilled-SFT](https://huggingface.co/reaperdoesntknow/Qwen3-1.7B-Coder-Distilled-SFT) | 302 | +| [Qwen3-1.7B-Coder-Distilled-SFT-GGUF](https://huggingface.co/reaperdoesntknow/Qwen3-1.7B-Coder-Distilled-SFT-GGUF) | 194 | +| [Qwen3-1.7B-Distilled-30B-A3B-SFT-GGUF](https://huggingface.co/reaperdoesntknow/Qwen3-1.7B-Distilled-30B-A3B-SFT-GGUF) | 175 | + +**Total Portfolio: 41 models | 2,781 total downloads** + + +*Last updated: 2026-03-28 12:56 UTC* + + + +## DistilQwen Collection + +This model is part of the **[DistilQwen](https://huggingface.co/collections/reaperdoesntknow/distilqwen-69bf40ec669117e3f069ef1c)** proof-weighted distillation series. 
+Collection: **9 models** | **2,788 downloads** + +### Teacher Variant Comparison + +| Teacher | Student Size | Strength | Models | +|---------|-------------|----------|--------| +| Qwen3-30B-A3B (Instruct) | 1.7B | Instruction following, structured output, legal reasoning | 3 (833 DL) | +| Qwen3-30B-A3B (Thinking) | 0.6B | Extended deliberation, higher-entropy distributions, proof derivation | 3 (779 DL) **← this model** | +| Qwen3-30B-A3B (Coder) | 1.7B | Structured decomposition, STEM derivation, logical inference | 2 (825 DL) | + +### Methodology + +**The only BF16 collection in the portfolio.** While the broader Convergent Intelligence catalog (43 models, 12,000+ downloads) was trained on CPU at FP32 for $24 total compute, the DistilQwen series was trained on H100 at BF16 with a 30B-parameter teacher. Same methodology, premium hardware. This is what happens when you give the pipeline real compute. + +All models use proof-weighted knowledge distillation: 55% cross-entropy with decaying proof weights (2.5× → 1.5×), 45% KL divergence at T=2.0. The proof weight amplifies loss on reasoning-critical tokens, forcing the student to allocate capacity to structural understanding rather than surface-level pattern matching. 
+ +Full methodology: [Structure Over Scale (DOI: 10.57967/hf/8165)](https://doi.org/10.57967/hf/8165) + +### Related in this series + +- [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT) (227 downloads) +- [Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF](https://huggingface.co/reaperdoesntknow/Qwen3-0.6B-Distilled-30B-A3B-Thinking-SFT-GGUF) (316 downloads) + + + + diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..01be9b3 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{-
'<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new
file mode 100644 index 0000000..cb42513 --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..6ef4252 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.0.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..06706b9 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cb003b4471fd18fac80b98dce7e3310bb77f8af463d395db7a33ec3c16651bd2 +size 1503300328 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..7d75d3b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +}