commit 4a10f41912edb74f8ca31a36b2e5ee4345dca1ee Author: ModelHub XC Date: Sat May 2 02:01:09 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: EphAsad/Aristaeus Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000..f2849b3 --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +--- +language: +- en +license: apache-2.0 +base_model: Qwen/Qwen2.5-1.5B-Instruct +tags: +- reasoning +- fine-tuned +- qwen2.5 +- math +- science +- code +- chain-of-thought +- unsloth +datasets: +- open-thoughts/OpenThoughts3-1.2M +- bespokelabs/Bespoke-Stratos-17k +pipeline_tag: text-generation +--- + +# Aristaeus + +**Aristaeus** is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), trained to improve structured, step-by-step reasoning across mathematics, science, logic, and code. It is a Stage 1 reasoning model — the goal of this release is deliberate, verifiable chain-of-thought, not raw benchmark maximisation. + +The name comes from Aristaeus, the ancient Greek deity of practical knowledge — beekeeping, olive cultivation, cheesemaking. Applied intelligence in service of real things. + +--- + +## Training + +| Detail | Value | +|---|---| +| Base model | Qwen/Qwen2.5-1.5B-Instruct | +| Fine-tune type | Full fine-tune (bf16) | +| Hardware | NVIDIA A100-SXM4-40GB | +| Training time | ~81 minutes | +| Epochs | 2 | +| Sequence length | 4096 tokens | +| Effective batch size | 16 (batch 2 × grad accum 8) | +| Learning rate | 2e-5 (cosine schedule) | +| Warmup ratio | 0.05 | +| Framework | Unsloth + TRL SFTTrainer | +| Final train loss | 1.083 | +| Final eval loss | 1.023 | + +### Datasets + +**[open-thoughts/OpenThoughts3-1.2M](https://huggingface.co/datasets/open-thoughts/OpenThoughts3-1.2M)** — 30,000 examples sampled via streaming. Reasoning traces generated by QwQ-32B (Apache 2.0). Covers mathematics, science, and coding problems with long chain-of-thought traces. + +**[bespokelabs/Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k)** — Full 16,710 examples. Curated from AIME/MATH olympiad problems, competitive programming (APPS, TACO), and science/puzzle data. 
Reasoning traces generated from DeepSeek-R1 via local inference. + +Combined training set: ~47,000 examples after normalisation and filtering. Both datasets were selected for clean licensing (no API-generated outputs from closed models). + +--- + +## Evaluation + +Aristaeus was compared against the base Qwen2.5-1.5B-Instruct across six reasoning tasks covering different problem types. Results below are from manual evaluation — no automated benchmark harness was used for this release. + +| Task | Aristaeus | Base | +|---|---|---| +| Unit conversion (train speed km/h → m/s) | ✅ Correct | ❌ Wrong (unit tracking failure) | +| Multi-step word problem (apples) | ✅ Correct | ✅ Correct | +| Deductive logic (mammals/warm-blooded) | ⚠️ Correct answer, minor overreach | ✅ Correct, richer detail | +| Recursive code trace (Fibonacci f(7)) | ❌ Lost thread, no answer | ✅ Correct (13) | +| Exponential growth (bacterial doubling) | ✅ Correct (6400) | ✅ Correct (6400) | +| Spatial constraint reasoning (water jug) | ✅ Correct, includes verification | ❌ Incoherent final steps | + +**2 wins / 1 loss / 3 draws** against base on this task set. + +### Honest limitations + +**Recursive call stack tracing** is the clearest failure mode. On `f(7)` Fibonacci, Aristaeus lost track of the recursion depth, began questioning its own assumptions, and produced no final answer. The base model handled it correctly. This is consistent with a known capacity ceiling at 1.5B parameters for problems that require holding many simultaneous state variables. A 7B model would likely not exhibit this failure. + +**Logical overconfidence** was observed on the deductive reasoning prompt. The model correctly concluded dolphins are warm-blooded, but also asserted snakes are cold-blooded purely from the premise "snakes are not mammals" — which does not logically follow without additional premises.
The model has learned to produce confident, structured conclusions, which occasionally leads it to state more than the premises support. This is a known SFT artefact when training data rewards assertive, well-formatted responses. + +The eval loss curve plateaued convincingly from step ~2800 onward, suggesting the model saturated the current dataset. Additional epochs would not improve this release. + +--- + +## Usage + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("EphAsad/Aristaeus") +tokenizer = AutoTokenizer.from_pretrained("EphAsad/Aristaeus") + +messages = [ + {"role": "system", "content": "You are a helpful reasoning assistant."}, + {"role": "user", "content": "A bacterial culture starts with 100 cells and doubles every 20 minutes. How many cells after 2 hours?"}, +] + +text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +inputs = tokenizer(text, return_tensors="pt").to(model.device) + +output = model.generate(**inputs, max_new_tokens=1024, temperature=0.6, top_p=0.9, do_sample=True) +print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)) +``` + +--- + +## Roadmap + +Aristaeus is a Stage 1 release. Two further stages are planned: + +**Stage 2 — Agentic tool use.** Fine-tuning on `lambda/hermes-agent-reasoning-traces` (Apache 2.0, agentic trajectories with `` blocks and real tool execution results) at 16k context. The intention is to teach the model *when* and *how* to use tools, layered on top of the reasoning foundation established here. + + +--- + +## Author + +Built by **Zain Asad** (Eph) — Senior Microbiology Analyst and Applied AI Engineer. 
+ +Core portfolio: [BactAID](https://doi.org/10.5281/zenodo.18089381) · [DomainEmbedder](https://huggingface.co/EphAsad/DomainEmbedder) · FireSOP · FireAccess LIMS · Eidos · Ananke + +--- + +## Licence + +Apache 2.0 — consistent with the base model and training datasets used. \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..642e597 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,53 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/config.json b/config.json new file mode 100644 index 0000000..753a593 --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "pad_token_id": 151665, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000.0, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.0", + "unsloth_fixed": true, + "unsloth_version": "2026.4.8", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..5f32a68 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_length": 32768, + "pad_token_id": 151665, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..a90d0ad --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97d0af2b393e4be8f1f16b455df4cd598658db1bb6859243bb564120b6738b9 +size 3087467144 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..5340d81 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5948af71b4f56cf697f7580814c7ce8b80595ef985544efcacf716126a2e31 +size 11422356 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..c0a1437 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,201 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": 
false, + "eos_token": "<|im_end|>", + "errors": "replace", + "is_local": false, + "model_max_length": 32768, + "pad_token": "<|PAD_TOKEN|>", + "padding_side": "left", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151654": { + "content":
"<|vision_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": false + }, + "151665": { + "content": "<|PAD_TOKEN|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..41f8e10 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid
sha256:f042a837591be9505e6f807b4babed7a019cff4f42a4f730eeda2a26b822f068 +size 5649