From 1f72039b0e7bbb24e4b74bca61a69a69cf0f39aa Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 24 Jun 2026 11:18:20 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: EphAsad/Atem-0.6B Source: Original Platform --- .gitattributes | 43 +++++ Atem-0.6b.Q4_K_M.gguf | 3 + Atem-0.6b.Q5_K_M.gguf | 3 + Atem-0.6b.Q8_0.gguf | 3 + Logo.png | 3 + Modelfile | 59 +++++++ README.md | 358 +++++++++++++++++++++++++++++++++++++++++ chat_template.jinja | 101 ++++++++++++ config.json | 64 ++++++++ generation_config.json | 13 ++ model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 234 +++++++++++++++++++++++++++ 13 files changed, 890 insertions(+) create mode 100644 .gitattributes create mode 100644 Atem-0.6b.Q4_K_M.gguf create mode 100644 Atem-0.6b.Q5_K_M.gguf create mode 100644 Atem-0.6b.Q8_0.gguf create mode 100644 Logo.png create mode 100644 Modelfile create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e595e95 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,43 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model 
filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +qwen3-0.6b.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text +qwen3-0.6b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text +qwen3-0.6b.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text +Atem-0.6b.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text +Atem-0.6b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text +Atem-0.6b.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text +Logo.png filter=lfs diff=lfs merge=lfs -text diff --git a/Atem-0.6b.Q4_K_M.gguf b/Atem-0.6b.Q4_K_M.gguf new file mode 100644 index 0000000..1c88960 --- /dev/null +++ b/Atem-0.6b.Q4_K_M.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddef2fb68fbf63a4ac00716aad48647f5da3249513b771cb8a32a12c5b9202e1 +size 396705696 diff --git a/Atem-0.6b.Q5_K_M.gguf b/Atem-0.6b.Q5_K_M.gguf new file mode 100644 index 0000000..5483c58 --- /dev/null +++ b/Atem-0.6b.Q5_K_M.gguf @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9dae12e3543d08aa81ab2244d54226b4a6afc713682a0ced185b2474a9dd9354 +size 444415904 diff --git a/Atem-0.6b.Q8_0.gguf b/Atem-0.6b.Q8_0.gguf new file mode 100644 index 0000000..acb8410 --- /dev/null +++ b/Atem-0.6b.Q8_0.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c17322b60c7e36d17149967844fe53881f9720e7125a8a2a58048c84ab8221 +size 639447968 diff --git a/Logo.png b/Logo.png new file mode 100644 index 0000000..2f31b1d --- /dev/null +++ b/Logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:036d268ac79d1a5355a5ea602fc02ba312f4938c3506ddfa148ec7de44b69607 +size 981796 diff --git a/Modelfile b/Modelfile new file mode 100644 index 0000000..b4da25b --- /dev/null +++ b/Modelfile @@ -0,0 +1,59 @@ + +FROM qwen3-0.6b.Q8_0.gguf +TEMPLATE """{{- if .Messages }} +{{- if or .System .Tools }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. 
+ +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {{ .Function }}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }}<|im_end|> +{{ end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 -}} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{ if .Content }}{{ .Content }} +{{- else if .ToolCalls }} +{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant +{{ end }} +{{- end }} +{{- else }} +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}""" +PARAMETER stop "<|im_end|>" +PARAMETER stop "<|im_start|>" +PARAMETER temperature 0.6 +PARAMETER min_p 0.0 +PARAMETER top_k 20 +PARAMETER top_p 0.95 +PARAMETER repeat_penalty 1 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..95ab122 --- /dev/null +++ b/README.md @@ -0,0 +1,358 @@ +--- +license: apache-2.0 +base_model: Qwen/Qwen3-0.6B +tags: +- unsloth +- lora +- qwen3 +- reasoning +- distillation +- conversational +datasets: +- EphAsad/QWENMillenium-SF +- EphAsad/Phi4Millennium-SF +- EphAsad/MistralMillenium-SF +- Modotte/CodeX-2M-Thinking +- Jackrong/Kimi-K2.5-Reasoning-1M-Cleaned +- WithinUsAI/MiniMax_M2.7_Distilled_5k +- tuanha1305/DeepSeek-R1-Distill +- open-r1/OpenThoughts-114k-math +- flytech/python-codes-25k +- FreedomIntelligence/medical-o1-reasoning-SFT +- 
Jackrong/Claude-opus-4.7-TraceInversion-5000x +language: +- en +pipeline_tag: text-generation +library_name: transformers +--- + +![Atem Logo](https://huggingface.co/EphAsad/Atem-0.6B/resolve/main/Logo.png) + +# Atem-0.6B + +*Ancient logic. Modern intelligence.* + +A 0.6B reasoning model trained via multi-source knowledge distillation from frontier teacher models. + +![Base Model](https://img.shields.io/badge/Base-Qwen3--0.6B-blue)![Method](https://img.shields.io/badge/Method-LoRA%20SFT-purple)![Parameters](https://img.shields.io/badge/Parameters-0.6B-orange)![License](https://img.shields.io/badge/License-Apache%202.0-green) + +--- + +## Overview + +Atem-0.6B is a 0.6B parameter reasoning model built via supervised fine-tuning on a curated corpus of approximately 120,000 examples distilled from multiple frontier teacher models. Starting from Qwen/Qwen3-0.6B, Atem was trained using LoRA to preserve base model capabilities while shifting output style toward clean, directly-formatted final answers. + +This is **Stage 1** of a planned multi-stage training series, and the first entry in the Atem family built on Qwen3 rather than Qwen2.5. Stage 1 strips `<think>` reasoning traces from all training data, deliberately suppressing Qwen3's native exposed chain-of-thought in favor of direct answers. **Stage 2 (Atem-Savant-0.6B) is currently in progress**, layering curated chain-of-thought traces back on top of this foundation — see [Known Limitations](#known-limitations) for why that stage matters. 
+ +--- + +## Model Details + +| Property | Value | +| ------------------------ | ---------------------------------------- | +| **Base model** | Qwen/Qwen3-0.6B | +| **Training method** | LoRA Supervised Fine-Tuning (Stage 1) | +| **LoRA config** | r=32, alpha=64, dropout=0.05 | +| **Target modules** | q, k, v, o, gate, up, down projections | +| **Parameters** | ~596M | +| **Trainable (LoRA) params** | 20,185,088 (3.28% of base) | +| **Training records** | 120,017 | +| **Epochs** | 2 | +| **Effective batch size** | 128 (batch 32 × grad accum 4) | +| **Learning rate** | 2e-4, cosine schedule, 5% warmup | +| **Final train loss** | 1.055 | +| **Final val loss** | 1.073 | +| **Hardware** | NVIDIA A100-SXM4 80GB | +| **Max sequence length** | 4,096 tokens | +| **Precision** | bfloat16 | +| **License** | Apache 2.0 | + +--- + +## Intended Use + +Atem-0.6B is designed for lightweight, open-ended reasoning tasks where structured, direct answers add value at low compute cost: + +- Code explanation, implementation, and debugging +- Mathematical problem solving with working shown +- Analytical reasoning and hypothesis evaluation +- Concept explanation and comparative analysis +- Logic, argument, and fallacy identification + +Atem-0.6B is **not** designed for retrieval-heavy factual lookup, real-time information, or tasks requiring broad knowledge breadth beyond its training domains. At 0.6B parameters its capability ceiling is naturally lower than larger Atem models — expect it to be most useful where speed and footprint matter more than depth on hard, multi-step problems. + +--- + +## Training Data + +Atem-0.6B was trained on a corpus assembled from eleven sources, combining domain-specific generated datasets and publicly available distillation datasets from frontier models. All outputs containing `<think>` reasoning traces were stripped to clean final responses for Stage 1 training. 
+ +| Dataset | Records | Source / Teacher | +| --------------------------------------------- | ------------ | ----------------------------------------------------- | +| EphAsad/QWENMillenium-SF | ~ | Qwen2.5-14B — Analytical & Scientific | +| EphAsad/Phi4Millennium-SF | ~ | Phi-4 14B — Mathematical Reasoning | +| EphAsad/MistralMillenium-SF | ~ | Mistral-Nemo-12B — Language & Comprehension | +| Modotte/CodeX-2M-Thinking | 40,000 | Mixed — Coding | +| Jackrong/Kimi-K2.5-Reasoning-1M-Cleaned | 23,000 | Kimi K2.5 — General Distillation (English filtered) | +| WithinUsAI/MiniMax_M2.7_Distilled_5k | 5,000 | MiniMax M2.7 | +| tuanha1305/DeepSeek-R1-Distill | 9,000 | DeepSeek-R1 | +| open-r1/OpenThoughts-114k-math | 10,000 | Mixed — Mathematics (correct answers only) | +| flytech/python-codes-25k | 10,000 | Python coding | +| FreedomIntelligence/medical-o1-reasoning-SFT | 10,000 | Medical reasoning (English config) | +| Jackrong/Claude-opus-4.7-TraceInversion-5000x | 5,000 | Claude Opus 4.7 — Trace Inversion | +| **Total** | **120,017** | | + +--- + +## Training Configuration + +```python +# Key hyperparameters +lora_r = 32 +lora_alpha = 64 +lora_dropout = 0.05 +max_seq_length = 4096 +learning_rate = 2e-4 +lr_scheduler = 'cosine' +warmup_ratio = 0.05 +batch_size = 32 +grad_accumulation = 4 # effective batch size: 128 +num_epochs = 2 +dtype = bfloat16 +load_in_4bit = True # during training + +``` + +Training used Unsloth with `train_on_responses_only` masking, ensuring loss was computed exclusively on assistant response tokens. Because Qwen3 ships with no default system prompt (unlike Qwen2.5-Instruct), Atem's identity is baked in via a chat-template modification that injects Atem as the default persona only when no explicit system message is supplied — explicit system messages still take priority. 
A pre-training validation suite verified this injection, confirmed the response-masking boundary correctly accounts for Qwen3's automatic empty `<think>` scaffold insertion, and checked for leaked reasoning content before training began. + +After training, LoRA adapters were merged into the base weights and exported as a full merged model. + +**Loss curve:** + +| Step | Train Loss | Val Loss | +| ----- | ---------- | --------- | +| 200 | 1.166 | 1.163 | +| 800 | 1.108 | 1.096 | +| 1400 | 0.983 | 1.077 | +| Final (1876) | **1.055** | **1.073** | + +Validation loss plateaued around step 1600 of 1876 total steps — the final ~15% of training produced only marginal further improvement (1.074 → 1.073). Train loss showed some batch-to-batch volatility late in training (a step-1800 spike to 1.088, consistent with the dataset's domain diversity rather than divergence), but validation loss stayed smooth and never reversed, indicating no overfitting across the two epochs. + +--- + +## Evaluation + +### Benchmark Results + +Evaluated against the base model (`unsloth/qwen3-0.6b-unsloth-bnb-4bit`) using lm-evaluation-harness. + +| Task | Base (Qwen3-0.6B) | Atem-0.6B | Delta | +| ------------------------ | ------------------ | ---------- | ----------- | +| ARC-Challenge (0-shot, acc_norm) | 33.0% | 35.0% | +2.0% ✓ | +| GSM8K (5-shot, strict-match) | 26.7% | **31.8%** | **+5.1%** ✓ | +| HellaSwag (0-shot, acc_norm) | 45.3% | 45.8% | +0.5% | + +**Eval condition note:** the base model was loaded in 4-bit (`unsloth/qwen3-0.6b-unsloth-bnb-4bit`); Atem-0.6B was evaluated as the full bfloat16 merged model. This is not a precision-matched comparison — the gap may be modestly inflated relative to a 4-bit-vs-4-bit or bf16-vs-bf16 run. GSM8K used 5-shot prompting per lm-eval's default config; ARC-Challenge and HellaSwag were 0-shot. 
+ +The GSM8K gain is the standout figure, but it likely reflects Stage 1's training toward clean, directly-formatted final answers — which matters a great deal for lm-eval's exact-match-on-extracted-number scoring — more than a deeper improvement in multi-step mathematical reasoning. The qualitative evaluation below, which looks at harder, less templated problems, supports this reading: reasoning depth on multi-step problems is not uniformly better than the base model. ARC-Challenge and HellaSwag, which probe general/commonsense knowledge rather than output formatting, moved only slightly — expected, since SFT on this corpus isn't designed to add new general knowledge. + +### Qualitative Evaluation + +Atem-0.6B was evaluated against base Qwen3-0.6B (default thinking-enabled) across 30 domain-representative questions with matched system prompts. + +| Domain | Questions | Outcome | +| --------------------- | --------- | ----------------------------------------------------------------------------------------- | +| Coding | 8 | Mixed — comparable correctness; Atem notably more concise and direct | +| Mathematics | 6 | Mixed — base model's exposed reasoning self-corrects mid-generation on some multi-step problems that Atem commits to an error on | +| Analytical Reasoning | 6 | Base model edges ahead — exposed reasoning gives more room to work through multi-step arguments | +| General Knowledge | 5 | Comparable | +| Language & Logic | 5 | Comparable, slight edge to base on illustrative examples | + +Atem-0.6B's outputs were consistently more concise and directly formatted — a direct result of Stage 1's design goal of suppressing exposed chain-of-thought. This did **not** translate into a uniform quality advantage over the base model: on problems requiring several sequential reasoning steps, the base model's visible thinking trace sometimes catches and corrects mistakes mid-generation that Atem, having no scratchpad, does not. 
This is the expected cost of the no-think format rather than a knowledge regression, and is the explicit target of the in-progress Stage 2 (Atem-Savant-0.6B) training, which reintroduces chain-of-thought. + +One output during qualitative testing showed repetitive/degenerate text (a duplicated bullet list) on a single open-ended analytical question — noted here for transparency rather than treated as representative. + +--- + +## Usage + +### Transformers + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +model_name = "EphAsad/Atem-0.6B" + +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map="auto" +) + +messages = [ + { + "role": "user", + "content": "Write a Python function that checks whether a number is prime." + } +] + +inputs = tokenizer.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_tensors="pt" +).to(model.device) + +with torch.no_grad(): + output = model.generate( + input_ids=inputs, + max_new_tokens=1000, + temperature=0.7, + top_p=0.9, + repetition_penalty=1.1, + do_sample=True, + ) + +response = tokenizer.decode( + output[0][inputs.shape[1]:], + skip_special_tokens=True +) +print(response) + +``` + +### Unsloth (faster inference) + +```python +from unsloth import FastLanguageModel +import torch + +model, tokenizer = FastLanguageModel.from_pretrained( + model_name="EphAsad/Atem-0.6B", + max_seq_length=4096, + dtype=torch.bfloat16, + load_in_4bit=True, +) +FastLanguageModel.for_inference(model) + +messages = [ + { + "role": "user", + "content": "Explain the difference between a stack and a queue, with examples." 
+ } +] + +inputs = tokenizer.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_tensors="pt" +).to("cuda") + +with torch.no_grad(): + output = model.generate( + input_ids=inputs, + max_new_tokens=1000, + temperature=0.7, + top_p=0.9, + do_sample=True, + ) + +print(tokenizer.decode( + output[0][inputs.shape[1]:], + skip_special_tokens=True +)) + +``` + +### Ollama + +```bash +# Recommended — best speed/quality balance +ollama run hf.co/EphAsad/Atem-0.6B:Q4_K_M + +# Higher quality +ollama run hf.co/EphAsad/Atem-0.6B:Q5_K_M + +# Near-lossless +ollama run hf.co/EphAsad/Atem-0.6B:Q8_0 + +``` + +### llama.cpp + +```bash +llama-server -hf EphAsad/Atem-0.6B:Q4_K_M + +``` + +### System Prompt + +Atem-0.6B's identity is baked into the chat template and activates automatically when no system message is provided. For manual override: + +``` +You are Atem, a precise and analytical reasoning assistant. You approach +every problem methodically — identifying core concepts, reasoning step by +step, and arriving at well-supported conclusions. You show your thinking +clearly and are thorough, direct, and intellectually honest. + +``` + +### Available Files + +| File | Size | Description | +| --------------------------- | ---------- | ----------------------------------- | +| `model.safetensors` | ~1.2 GB | Full bfloat16 merged weights | +| `Atem-0.6b.Q4_K_M.gguf` | ~397 MB | 4-bit quantised — recommended | +| `Atem-0.6b.Q5_K_M.gguf` | ~444 MB | 5-bit quantised | +| `Atem-0.6b.Q8_0.gguf` | ~639 MB | 8-bit quantised — near-lossless | + +--- + +## Known Limitations + +**No thinking traces (Stage 1 by design).** Think tags were stripped from all training data for Stage 1, and Qwen3's native exposed reasoning is suppressed. The model does not produce extended `<think>` content. 
As shown in the qualitative evaluation above, this measurably costs accuracy on multi-step analytical and mathematical problems relative to the base model's default thinking-enabled behavior — Stage 2 (Atem-Savant-0.6B, in progress) exists specifically to recover this. + +**Smaller capability ceiling than larger Atem models.** At 0.6B parameters, this is the smallest model in the Atem family. Treat it as a fast, low-footprint option rather than a reasoning-depth flagship. + +**Mathematical precision on complex problems.** On multi-step calculations, the model may make arithmetic or counting errors without a scratchpad to catch them — verified directly in qualitative testing (e.g., miscounting combinatorial outcomes). Answers to high-stakes mathematical problems should be independently verified. + +**Eval precision asymmetry.** The benchmark comparison above evaluated the base model in 4-bit and Atem-0.6B in bfloat16 — see the Evaluation section for details. A precision-matched re-run would give a cleaner comparison. + +--- + +## Roadmap + +Atem-0.6B establishes the Stage 1 foundation for the Qwen3-based branch of the Atem family. 
Planned next steps: + +- **Stage 2 (in progress):** Atem-Savant-0.6B — LoRA SFT on curated chain-of-thought data (~90% think-trace records, ~10% no-think) using OpenR1-Math, Kimi-K2.5, DeepSeek-V4-Pro-Reasoning, OpenCodeReasoning, and trace-inversion datasets, to recover multi-step reasoning depth on top of Stage 1's direct-answer foundation +- **Extended benchmarks:** MMLU, BBH, IFEval post-Stage 2 +- **Precision-matched re-benchmark:** re-run base vs Atem comparison under identical 4-bit (or identical bf16) conditions + +--- + +## Citation + +```bibtex +@misc{atem_06b_2026, + author = {Asad, Zain}, + title = {Atem-0.6B: A 0.6B Direct-Reasoning Model via + Stage 1 SFT on Qwen3}, + year = {2026}, + publisher = {HuggingFace}, + howpublished = {\url{https://huggingface.co/EphAsad/Atem-0.6B}}, +} + +``` + +--- + +## License + +Released under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0), consistent with the base model Qwen/Qwen3-0.6B. + +--- + +Built independently by Zain Asad - [EphAsad](https://huggingface.co/EphAsad) \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..293244b --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,101 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\n' + 'You are Atem, a precise and analytical reasoning 
assistant. You approach every problem methodically — identifying core concepts, reasoning step by step, and arriving at well-supported conclusions. You show your thinking clearly and are thorough, direct, and intellectually honest.' + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for forward_message in messages %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- set message = messages[index] %} + {%- set current_content = message.content if message.content is defined and message.content is not none else '' %} + {%- set tool_start = '' %} + {%- set tool_start_length = tool_start|length %} + {%- set start_of_message = current_content[:tool_start_length] %} + {%- set tool_end = '' %} + {%- set tool_end_length = tool_end|length %} + {%- set start_pos = (current_content|length) - tool_end_length %} + {%- if start_pos < 0 %} + {%- set start_pos = 0 %} + {%- endif %} + {%- set end_of_message = current_content[start_pos:] %} + {%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set m_content = message.content if message.content is defined and message.content is not none else '' %} + {%- set content = m_content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in m_content %} + {%- set content = (m_content.split('')|last).lstrip('\n') %} + {%- set reasoning_content = 
(m_content.split('')|first).rstrip('\n') %} + {%- set reasoning_content = (reasoning_content.split('')|last).lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and (not reasoning_content.strip() == '')) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..ff8229a --- /dev/null +++ b/config.json @@ -0,0 +1,64 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "torch_dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + 
"hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151669, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "unsloth_version": "2026.5.5", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..a9abf9b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,13 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_length": 40960, + "pad_token_id": 151669, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.5.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..9339765 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f0e3ed85d13b43971c3d1ed20e42fea849c360b42613d914507bdbf16f8c98 +size 1192135096 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..7edcf72 --- /dev/null +++ b/tokenizer.json @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7430e9138b76e93fb6f93462394d236b411111aef53cb421ba97d2691040cca +size 11423114 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..4ada1a1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,234 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 40960, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": 
false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": 
false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151666": { + "content": "</tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151667": { + "content": "<think>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151668": { + "content": "</think>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151669": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + }, + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\n' + 'You are Atem, a precise and analytical reasoning assistant. You approach every problem methodically — identifying core concepts, reasoning step by step, and arriving at well-supported conclusions. You show your thinking clearly and are thorough, direct, and intellectually honest.' 
+ '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message in messages %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- set message = messages[index] %}\n {%- set current_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set tool_start = '<tool_response>' %}\n {%- set tool_start_length = tool_start|length %}\n {%- set start_of_message = current_content[:tool_start_length] %}\n {%- set tool_end = '</tool_response>' %}\n {%- set tool_end_length = tool_end|length %}\n {%- set start_pos = (current_content|length) - tool_end_length %}\n {%- if start_pos < 0 %}\n {%- set start_pos = 0 %}\n {%- endif %}\n {%- set end_of_message = current_content[start_pos:] %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(start_of_message == tool_start and end_of_message == tool_end) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set m_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set content = m_content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in m_content %}\n {%- set content = (m_content.split('</think>')|last).lstrip('\\n') %}\n {%- set reasoning_content = (m_content.split('</think>')|first).rstrip('\\n') %}\n {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not 
loop.last and (not reasoning_content.strip() == '')) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}" +} \ No newline at end of file