初始化项目,由ModelHub XC社区提供模型
Model: EphAsad/Atem-0.6B Source: Original Platform
This commit is contained in:
43
.gitattributes
vendored
Normal file
43
.gitattributes
vendored
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
|
qwen3-0.6b.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
qwen3-0.6b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
qwen3-0.6b.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Atem-0.6b.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Atem-0.6b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Atem-0.6b.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Logo.png filter=lfs diff=lfs merge=lfs -text
|
||||||
3
Atem-0.6b.Q4_K_M.gguf
Normal file
3
Atem-0.6b.Q4_K_M.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:ddef2fb68fbf63a4ac00716aad48647f5da3249513b771cb8a32a12c5b9202e1
|
||||||
|
size 396705696
|
||||||
3
Atem-0.6b.Q5_K_M.gguf
Normal file
3
Atem-0.6b.Q5_K_M.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9dae12e3543d08aa81ab2244d54226b4a6afc713682a0ced185b2474a9dd9354
|
||||||
|
size 444415904
|
||||||
3
Atem-0.6b.Q8_0.gguf
Normal file
3
Atem-0.6b.Q8_0.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:75c17322b60c7e36d17149967844fe53881f9720e7125a8a2a58048c84ab8221
|
||||||
|
size 639447968
|
||||||
3
Logo.png
Normal file
3
Logo.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:036d268ac79d1a5355a5ea602fc02ba312f4938c3506ddfa148ec7de44b69607
|
||||||
|
size 981796
|
||||||
59
Modelfile
Normal file
59
Modelfile
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
|
||||||
|
FROM Atem-0.6b.Q8_0.gguf
|
||||||
|
TEMPLATE """{{- if .Messages }}
|
||||||
|
{{- if or .System .Tools }}<|im_start|>system
|
||||||
|
{{- if .System }}
|
||||||
|
{{ .System }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Tools }}
|
||||||
|
|
||||||
|
# Tools
|
||||||
|
|
||||||
|
You may call one or more functions to assist with the user query.
|
||||||
|
|
||||||
|
You are provided with function signatures within <tools></tools> XML tags:
|
||||||
|
<tools>
|
||||||
|
{{- range .Tools }}
|
||||||
|
{"type": "function", "function": {{ .Function }}}
|
||||||
|
{{- end }}
|
||||||
|
</tools>
|
||||||
|
|
||||||
|
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
||||||
|
<tool_call>
|
||||||
|
{"name": <function-name>, "arguments": <args-json-object>}
|
||||||
|
</tool_call>
|
||||||
|
{{- end }}<|im_end|>
|
||||||
|
{{ end }}
|
||||||
|
{{- range $i, $_ := .Messages }}
|
||||||
|
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
|
||||||
|
{{- if eq .Role "user" }}<|im_start|>user
|
||||||
|
{{ .Content }}<|im_end|>
|
||||||
|
{{ else if eq .Role "assistant" }}<|im_start|>assistant
|
||||||
|
{{ if .Content }}{{ .Content }}
|
||||||
|
{{- else if .ToolCalls }}<tool_call>
|
||||||
|
{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
|
||||||
|
{{ end }}</tool_call>
|
||||||
|
{{- end }}{{ if not $last }}<|im_end|>
|
||||||
|
{{ end }}
|
||||||
|
{{- else if eq .Role "tool" }}<|im_start|>user
|
||||||
|
<tool_response>
|
||||||
|
{{ .Content }}
|
||||||
|
</tool_response><|im_end|>
|
||||||
|
{{ end }}
|
||||||
|
{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
|
||||||
|
{{ end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- else }}
|
||||||
|
{{- if .System }}<|im_start|>system
|
||||||
|
{{ .System }}<|im_end|>
|
||||||
|
{{ end }}{{ if .Prompt }}<|im_start|>user
|
||||||
|
{{ .Prompt }}<|im_end|>
|
||||||
|
{{ end }}<|im_start|>assistant
|
||||||
|
{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}"""
|
||||||
|
PARAMETER stop "<|im_end|>"
|
||||||
|
PARAMETER stop "<|im_start|>"
|
||||||
|
PARAMETER temperature 0.6
|
||||||
|
PARAMETER min_p 0.0
|
||||||
|
PARAMETER top_k 20
|
||||||
|
PARAMETER top_p 0.95
|
||||||
|
PARAMETER repeat_penalty 1
|
||||||
358
README.md
Normal file
358
README.md
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
---
|
||||||
|
license: apache-2.0
|
||||||
|
base_model: Qwen/Qwen3-0.6B
|
||||||
|
tags:
|
||||||
|
- unsloth
|
||||||
|
- lora
|
||||||
|
- qwen3
|
||||||
|
- reasoning
|
||||||
|
- distillation
|
||||||
|
- conversational
|
||||||
|
datasets:
|
||||||
|
- EphAsad/QWENMillenium-SF
|
||||||
|
- EphAsad/Phi4Millennium-SF
|
||||||
|
- EphAsad/MistralMillenium-SF
|
||||||
|
- Modotte/CodeX-2M-Thinking
|
||||||
|
- Jackrong/Kimi-K2.5-Reasoning-1M-Cleaned
|
||||||
|
- WithinUsAI/MiniMax_M2.7_Distilled_5k
|
||||||
|
- tuanha1305/DeepSeek-R1-Distill
|
||||||
|
- open-r1/OpenThoughts-114k-math
|
||||||
|
- flytech/python-codes-25k
|
||||||
|
- FreedomIntelligence/medical-o1-reasoning-SFT
|
||||||
|
- Jackrong/Claude-opus-4.7-TraceInversion-5000x
|
||||||
|
language:
|
||||||
|
- en
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
library_name: transformers
|
||||||
|
---
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
# Atem-0.6B
|
||||||
|
|
||||||
|
*Ancient logic. Modern intelligence.*
|
||||||
|
|
||||||
|
A 0.6B reasoning model trained via multi-source knowledge distillation from frontier teacher models.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Atem-0.6B is a 0.6B parameter reasoning model built via supervised fine-tuning on a curated corpus of approximately 120,000 examples distilled from multiple frontier teacher models. Starting from Qwen/Qwen3-0.6B, Atem was trained using LoRA to preserve base model capabilities while shifting output style toward clean, directly-formatted final answers.
|
||||||
|
|
||||||
|
This is **Stage 1** of a planned multi-stage training series, and the first entry in the Atem family built on Qwen3 rather than Qwen2.5. Stage 1 strips `<think>` reasoning traces from all training data, deliberately suppressing Qwen3's native exposed chain-of-thought in favor of direct answers. **Stage 2 (Atem-Savant-0.6B) is currently in progress**, layering curated chain-of-thought traces back on top of this foundation — see [Known Limitations](#known-limitations) for why that stage matters.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
| Property | Value |
|
||||||
|
| ------------------------ | ---------------------------------------- |
|
||||||
|
| **Base model** | Qwen/Qwen3-0.6B |
|
||||||
|
| **Training method** | LoRA Supervised Fine-Tuning (Stage 1) |
|
||||||
|
| **LoRA config** | r=32, alpha=64, dropout=0.05 |
|
||||||
|
| **Target modules** | q, k, v, o, gate, up, down projections |
|
||||||
|
| **Parameters** | ~596M |
|
||||||
|
| **Trainable (LoRA) params** | 20,185,088 (3.28% of base + adapter parameters) |
|
||||||
|
| **Training records** | 120,017 |
|
||||||
|
| **Epochs** | 2 |
|
||||||
|
| **Effective batch size** | 128 (batch 32 × grad accum 4) |
|
||||||
|
| **Learning rate** | 2e-4, cosine schedule, 5% warmup |
|
||||||
|
| **Final train loss** | 1.055 |
|
||||||
|
| **Final val loss** | 1.073 |
|
||||||
|
| **Hardware** | NVIDIA A100-SXM4 80GB |
|
||||||
|
| **Max sequence length** | 4,096 tokens |
|
||||||
|
| **Precision** | bfloat16 |
|
||||||
|
| **License** | Apache 2.0 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Intended Use
|
||||||
|
|
||||||
|
Atem-0.6B is designed for lightweight, open-ended reasoning tasks where structured, direct answers add value at low compute cost:
|
||||||
|
|
||||||
|
- Code explanation, implementation, and debugging
|
||||||
|
- Mathematical problem solving with working shown
|
||||||
|
- Analytical reasoning and hypothesis evaluation
|
||||||
|
- Concept explanation and comparative analysis
|
||||||
|
- Logic, argument, and fallacy identification
|
||||||
|
|
||||||
|
Atem-0.6B is **not** designed for retrieval-heavy factual lookup, real-time information, or tasks requiring broad knowledge breadth beyond its training domains. At 0.6B parameters its capability ceiling is naturally lower than larger Atem models — expect it to be most useful where speed and footprint matter more than depth on hard, multi-step problems.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Training Data
|
||||||
|
|
||||||
|
Atem-0.6B was trained on a corpus assembled from eleven sources, combining domain-specific generated datasets and publicly available distillation datasets from frontier models. All outputs containing `<think>` reasoning traces were stripped to clean final responses for Stage 1 training.
|
||||||
|
|
||||||
|
| Dataset | Records | Source / Teacher |
|
||||||
|
| --------------------------------------------- | ------------ | ----------------------------------------------------- |
|
||||||
|
| EphAsad/QWENMillenium-SF | ~ | Qwen2.5-14B — Analytical & Scientific |
|
||||||
|
| EphAsad/Phi4Millennium-SF | ~ | Phi-4 14B — Mathematical Reasoning |
|
||||||
|
| EphAsad/MistralMillenium-SF | ~ | Mistral-Nemo-12B — Language & Comprehension |
|
||||||
|
| Modotte/CodeX-2M-Thinking | 40,000 | Mixed — Coding |
|
||||||
|
| Jackrong/Kimi-K2.5-Reasoning-1M-Cleaned | 23,000 | Kimi K2.5 — General Distillation (English filtered) |
|
||||||
|
| WithinUsAI/MiniMax_M2.7_Distilled_5k | 5,000 | MiniMax M2.7 |
|
||||||
|
| tuanha1305/DeepSeek-R1-Distill | 9,000 | DeepSeek-R1 |
|
||||||
|
| open-r1/OpenThoughts-114k-math | 10,000 | Mixed — Mathematics (correct answers only) |
|
||||||
|
| flytech/python-codes-25k | 10,000 | Python coding |
|
||||||
|
| FreedomIntelligence/medical-o1-reasoning-SFT | 10,000 | Medical reasoning (English config) |
|
||||||
|
| Jackrong/Claude-opus-4.7-TraceInversion-5000x | 5,000 | Claude Opus 4.7 — Trace Inversion |
|
||||||
|
| **Total** | **120,017** | |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Training Configuration
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Key hyperparameters
|
||||||
|
lora_r = 32
|
||||||
|
lora_alpha = 64
|
||||||
|
lora_dropout = 0.05
|
||||||
|
max_seq_length = 4096
|
||||||
|
learning_rate = 2e-4
|
||||||
|
lr_scheduler = 'cosine'
|
||||||
|
warmup_ratio = 0.05
|
||||||
|
batch_size = 32
|
||||||
|
grad_accumulation = 4 # effective batch size: 128
|
||||||
|
num_epochs = 2
|
||||||
|
dtype = bfloat16
|
||||||
|
load_in_4bit = True # during training
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Training used Unsloth with `train_on_responses_only` masking, ensuring loss was computed exclusively on assistant response tokens. Because Qwen3 ships with no default system prompt (unlike Qwen2.5-Instruct), Atem's identity is baked in via a chat-template modification that injects Atem as the default persona only when no explicit system message is supplied — explicit system messages still take priority. A validation suite run before fine-tuning began verified this injection, confirmed the response-masking boundary correctly accounts for Qwen3's automatic empty `<think></think>` scaffold insertion, and checked the training data for leaked reasoning content.
|
||||||
|
|
||||||
|
After training, LoRA adapters were merged into the base weights and exported as a full merged model.
|
||||||
|
|
||||||
|
**Loss curve:**
|
||||||
|
|
||||||
|
| Step | Train Loss | Val Loss |
|
||||||
|
| ----- | ---------- | --------- |
|
||||||
|
| 200 | 1.166 | 1.163 |
|
||||||
|
| 800 | 1.108 | 1.096 |
|
||||||
|
| 1400 | 0.983 | 1.077 |
|
||||||
|
| Final (1876) | **1.055** | **1.073** |
|
||||||
|
|
||||||
|
Validation loss plateaued around step 1600 of 1876 total steps — the final ~15% of training produced only marginal further improvement (1.074 → 1.073). Train loss showed some batch-to-batch volatility late in training (a step-1800 spike to 1.088, consistent with the dataset's domain diversity rather than divergence), but validation loss stayed smooth and never reversed, indicating no overfitting across the two epochs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
### Benchmark Results
|
||||||
|
|
||||||
|
Evaluated against the base model (`unsloth/qwen3-0.6b-unsloth-bnb-4bit`) using lm-evaluation-harness.
|
||||||
|
|
||||||
|
| Task                     | Base (Qwen3-0.6B)  | Atem-0.6B  | Delta (percentage points) |
|
||||||
|
| ------------------------ | ------------------ | ---------- | ----------- |
|
||||||
|
| ARC-Challenge (0-shot, acc_norm) | 33.0% | 35.0% | +2.0% ✓ |
|
||||||
|
| GSM8K (5-shot, strict-match) | 26.7% | **31.8%** | **+5.1%** ✓ |
|
||||||
|
| HellaSwag (0-shot, acc_norm) | 45.3% | 45.8% | +0.5% |
|
||||||
|
|
||||||
|
**Eval condition note:** the base model was loaded in 4-bit (`unsloth/qwen3-0.6b-unsloth-bnb-4bit`); Atem-0.6B was evaluated as the full bfloat16 merged model. This is not a precision-matched comparison — the gap may be modestly inflated relative to a 4-bit-vs-4-bit or bf16-vs-bf16 run. GSM8K used 5-shot prompting per lm-eval's default config; ARC-Challenge and HellaSwag were 0-shot.
|
||||||
|
|
||||||
|
The GSM8K gain is the standout figure, but it likely reflects Stage 1's training toward clean, directly-formatted final answers — which matters a great deal for lm-eval's exact-match-on-extracted-number scoring — more than a deeper improvement in multi-step mathematical reasoning. The qualitative evaluation below, which looks at harder, less templated problems, supports this reading: reasoning depth on multi-step problems is not uniformly better than the base model. ARC-Challenge and HellaSwag, which probe general/commonsense knowledge rather than output formatting, moved only slightly — expected, since SFT on this corpus isn't designed to add new general knowledge.
|
||||||
|
|
||||||
|
### Qualitative Evaluation
|
||||||
|
|
||||||
|
Atem-0.6B was evaluated against base Qwen3-0.6B (default thinking-enabled) across 30 domain-representative questions with matched system prompts.
|
||||||
|
|
||||||
|
| Domain | Questions | Outcome |
|
||||||
|
| --------------------- | --------- | ----------------------------------------------------------------------------------------- |
|
||||||
|
| Coding | 8 | Mixed — comparable correctness; Atem notably more concise and direct |
|
||||||
|
| Mathematics | 6 | Mixed — base model's exposed reasoning self-corrects mid-generation on some multi-step problems that Atem commits to an error on |
|
||||||
|
| Analytical Reasoning | 6 | Base model edges ahead — exposed reasoning gives more room to work through multi-step arguments |
|
||||||
|
| General Knowledge | 5 | Comparable |
|
||||||
|
| Language & Logic | 5 | Comparable, slight edge to base on illustrative examples |
|
||||||
|
|
||||||
|
Atem-0.6B's outputs were consistently more concise and directly formatted — a direct result of Stage 1's design goal of suppressing exposed chain-of-thought. This did **not** translate into a uniform quality advantage over the base model: on problems requiring several sequential reasoning steps, the base model's visible thinking trace sometimes catches and corrects mistakes mid-generation that Atem, having no scratchpad, does not. This is the expected cost of the no-think format rather than a knowledge regression, and is the explicit target of the in-progress Stage 2 (Atem-Savant-0.6B) training, which reintroduces chain-of-thought.
|
||||||
|
|
||||||
|
One output during qualitative testing showed repetitive/degenerate text (a duplicated bullet list) on a single open-ended analytical question — noted here for transparency rather than treated as representative.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Transformers
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_name = "EphAsad/Atem-0.6B"
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto"
|
||||||
|
)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Write a Python function that checks whether a number is prime."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
inputs = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=True,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to(model.device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=inputs,
|
||||||
|
max_new_tokens=1000,
|
||||||
|
temperature=0.7,
|
||||||
|
top_p=0.9,
|
||||||
|
repetition_penalty=1.1,
|
||||||
|
do_sample=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = tokenizer.decode(
|
||||||
|
output[0][inputs.shape[1]:],
|
||||||
|
skip_special_tokens=True
|
||||||
|
)
|
||||||
|
print(response)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Unsloth (faster inference)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from unsloth import FastLanguageModel
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model, tokenizer = FastLanguageModel.from_pretrained(
|
||||||
|
model_name="EphAsad/Atem-0.6B",
|
||||||
|
max_seq_length=4096,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
load_in_4bit=True,
|
||||||
|
)
|
||||||
|
FastLanguageModel.for_inference(model)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Explain the difference between a stack and a queue, with examples."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
inputs = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=True,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to("cuda")
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
output = model.generate(
|
||||||
|
input_ids=inputs,
|
||||||
|
max_new_tokens=1000,
|
||||||
|
temperature=0.7,
|
||||||
|
top_p=0.9,
|
||||||
|
do_sample=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(tokenizer.decode(
|
||||||
|
output[0][inputs.shape[1]:],
|
||||||
|
skip_special_tokens=True
|
||||||
|
))
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Ollama
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recommended — best speed/quality balance
|
||||||
|
ollama run hf.co/EphAsad/Atem-0.6B:Q4_K_M
|
||||||
|
|
||||||
|
# Higher quality
|
||||||
|
ollama run hf.co/EphAsad/Atem-0.6B:Q5_K_M
|
||||||
|
|
||||||
|
# Near-lossless
|
||||||
|
ollama run hf.co/EphAsad/Atem-0.6B:Q8_0
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### llama.cpp
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-server -hf EphAsad/Atem-0.6B:Q4_K_M
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### System Prompt
|
||||||
|
|
||||||
|
Atem-0.6B's identity is baked into the chat template and activates automatically when no system message is provided. For manual override:
|
||||||
|
|
||||||
|
```
|
||||||
|
You are Atem, a precise and analytical reasoning assistant. You approach
|
||||||
|
every problem methodically — identifying core concepts, reasoning step by
|
||||||
|
step, and arriving at well-supported conclusions. You show your thinking
|
||||||
|
clearly and are thorough, direct, and intellectually honest.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Available Files
|
||||||
|
|
||||||
|
| File | Size | Description |
|
||||||
|
| --------------------------- | ---------- | ----------------------------------- |
|
||||||
|
| `model.safetensors` | ~1.2 GB | Full bfloat16 merged weights |
|
||||||
|
| `Atem-0.6b.Q4_K_M.gguf` | ~397 MB | 4-bit quantised — recommended |
|
||||||
|
| `Atem-0.6b.Q5_K_M.gguf` | ~444 MB | 5-bit quantised |
|
||||||
|
| `Atem-0.6b.Q8_0.gguf`       | ~639 MB    | 8-bit quantised — near-lossless     |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Limitations
|
||||||
|
|
||||||
|
**No thinking traces (Stage 1 by design).** Think tags were stripped from all training data for Stage 1, and Qwen3's native exposed reasoning is suppressed. The model does not produce extended `<think>` content. As shown in the qualitative evaluation above, this measurably costs accuracy on multi-step analytical and mathematical problems relative to the base model's default thinking-enabled behavior — Stage 2 (Atem-Savant-0.6B, in progress) exists specifically to recover this.
|
||||||
|
|
||||||
|
**Smaller capability ceiling than larger Atem models.** At 0.6B parameters, this is the smallest model in the Atem family. Treat it as a fast, low-footprint option rather than a reasoning-depth flagship.
|
||||||
|
|
||||||
|
**Mathematical precision on complex problems.** On multi-step calculations, the model may make arithmetic or counting errors without a scratchpad to catch them — verified directly in qualitative testing (e.g., miscounting combinatorial outcomes). Answers to high-stakes mathematical problems should be independently verified.
|
||||||
|
|
||||||
|
**Eval precision asymmetry.** The benchmark comparison above evaluated the base model in 4-bit and Atem-0.6B in bfloat16 — see the Evaluation section for details. A precision-matched re-run would give a cleaner comparison.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Roadmap
|
||||||
|
|
||||||
|
Atem-0.6B establishes the Stage 1 foundation for the Qwen3-based branch of the Atem family. Planned next steps:
|
||||||
|
|
||||||
|
- **Stage 2 (in progress):** Atem-Savant-0.6B — LoRA SFT on curated chain-of-thought data (~90% think-trace records, ~10% no-think) using OpenR1-Math, Kimi-K2.5, DeepSeek-V4-Pro-Reasoning, OpenCodeReasoning, and trace-inversion datasets, to recover multi-step reasoning depth on top of Stage 1's direct-answer foundation
|
||||||
|
- **Extended benchmarks:** MMLU, BBH, IFEval post-Stage 2
|
||||||
|
- **Precision-matched re-benchmark:** re-run base vs Atem comparison under identical 4-bit (or identical bf16) conditions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@misc{atem_06b_2026,
|
||||||
|
author = {Asad, Zain},
|
||||||
|
title = {Atem-0.6B: A 0.6B Direct-Reasoning Model via
|
||||||
|
Stage 1 SFT on Qwen3},
|
||||||
|
year = {2026},
|
||||||
|
publisher = {HuggingFace},
|
||||||
|
howpublished = {\url{https://huggingface.co/EphAsad/Atem-0.6B}},
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Released under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0), consistent with the base model Qwen/Qwen3-0.6B.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Built independently by Zain Asad - [EphAsad](https://huggingface.co/EphAsad)
|
||||||
101
chat_template.jinja
Normal file
101
chat_template.jinja
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
{%- if tools %}
|
||||||
|
{{- '<|im_start|>system\n' }}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- messages[0].content + '\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{{- "\n" }}
|
||||||
|
{{- tool | tojson }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||||
|
{%- else %}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>system\n' + 'You are Atem, a precise and analytical reasoning assistant. You approach every problem methodically — identifying core concepts, reasoning step by step, and arriving at well-supported conclusions. You show your thinking clearly and are thorough, direct, and intellectually honest.' + '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||||
|
{%- for forward_message in messages %}
|
||||||
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||||
|
{%- set message = messages[index] %}
|
||||||
|
{%- set current_content = message.content if message.content is defined and message.content is not none else '' %}
|
||||||
|
{%- set tool_start = '<tool_response>' %}
|
||||||
|
{%- set tool_start_length = tool_start|length %}
|
||||||
|
{%- set start_of_message = current_content[:tool_start_length] %}
|
||||||
|
{%- set tool_end = '</tool_response>' %}
|
||||||
|
{%- set tool_end_length = tool_end|length %}
|
||||||
|
{%- set start_pos = (current_content|length) - tool_end_length %}
|
||||||
|
{%- if start_pos < 0 %}
|
||||||
|
{%- set start_pos = 0 %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- set end_of_message = current_content[start_pos:] %}
|
||||||
|
{%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %}
|
||||||
|
{%- set ns.multi_step_tool = false %}
|
||||||
|
{%- set ns.last_query_index = index %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{%- set m_content = message.content if message.content is defined and message.content is not none else '' %}
|
||||||
|
{%- set content = m_content %}
|
||||||
|
{%- set reasoning_content = '' %}
|
||||||
|
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
||||||
|
{%- set reasoning_content = message.reasoning_content %}
|
||||||
|
{%- else %}
|
||||||
|
{%- if '</think>' in m_content %}
|
||||||
|
{%- set content = (m_content.split('</think>')|last).lstrip('\n') %}
|
||||||
|
{%- set reasoning_content = (m_content.split('</think>')|first).rstrip('\n') %}
|
||||||
|
{%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip('\n') %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if loop.index0 > ns.last_query_index %}
|
||||||
|
{%- if loop.last or (not loop.last and (not reasoning_content.strip() == '')) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if message.tool_calls %}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{%- if (loop.first and content) or (not loop.first) %}
|
||||||
|
{{- '\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if tool_call.function %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<tool_call>\n{"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '", "arguments": ' }}
|
||||||
|
{%- if tool_call.arguments is string %}
|
||||||
|
{{- tool_call.arguments }}
|
||||||
|
{%- else %}
|
||||||
|
{{- tool_call.arguments | tojson }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '}\n</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||||
|
{{- '<|im_start|>user' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_response>\n' }}
|
||||||
|
{{- message.content }}
|
||||||
|
{{- '\n</tool_response>' }}
|
||||||
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant\n' }}
|
||||||
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||||
|
{{- '<think>\n\n</think>\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
64
config.json
Normal file
64
config.json
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"Qwen3ForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": null,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"eos_token_id": 151645,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 1024,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 3072,
|
||||||
|
"layer_types": [
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention"
|
||||||
|
],
|
||||||
|
"max_position_embeddings": 40960,
|
||||||
|
"max_window_layers": 28,
|
||||||
|
"model_type": "qwen3",
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_hidden_layers": 28,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pad_token_id": 151669,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_parameters": {
|
||||||
|
"rope_theta": 1000000,
|
||||||
|
"rope_type": "default"
|
||||||
|
},
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"unsloth_fixed": true,
|
||||||
|
"unsloth_version": "2026.5.5",
|
||||||
|
"use_cache": false,
|
||||||
|
"use_sliding_window": false,
|
||||||
|
"vocab_size": 151936
|
||||||
|
}
|
||||||
13
generation_config.json
Normal file
13
generation_config.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": [
|
||||||
|
151645,
|
||||||
|
151643
|
||||||
|
],
|
||||||
|
"max_length": 40960,
|
||||||
|
"pad_token_id": 151669,
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_k": 20,
|
||||||
|
"top_p": 0.95,
|
||||||
|
"transformers_version": "5.5.0"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d7f0e3ed85d13b43971c3d1ed20e42fea849c360b42613d914507bdbf16f8c98
|
||||||
|
size 1192135096
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d7430e9138b76e93fb6f93462394d236b411111aef53cb421ba97d2691040cca
|
||||||
|
size 11423114
|
||||||
234
tokenizer_config.json
Normal file
234
tokenizer_config.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user