初始化项目，由ModelHub XC社区提供模型

Model: laion/Sera-4.5A-Full-T1-v3-316-axolotl__Qwen3-8B Source: Original Platform
2026-06-16 06:04:17 +08:00
commit ad612efc77
10 changed files with 2992 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,133 @@
+---
+library_name: transformers
+base_model: Qwen/Qwen3-8B
+tags:
+- generated_from_trainer
+datasets:
+- laion/Sera-4.5A-Full-T1-v3-316
+model-index:
+- name: e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+
+axolotl version: `0.16.0.dev0`
+```yaml
+# Mirror of SERA's axolotl_qwen3_8b.yaml, parameterized for Jupiter SFT on v3 data.
+# Filled at render time via sed-substitution of 316.
+
+base_model: Qwen/Qwen3-8B
+deepspeed: /e/scratch/jureap59/feuer1/code/axolotl/deepspeed_configs/zero3_bf16.json
+
+load_in_8bit: false
+load_in_4bit: false
+
+# plugins disabled 2026-04-22: CCE + bf16 + flash-attn on aarch64/torch2.9 caused
+# gradient explosion (grad_norm 9.8e+11) and loss -> 0 within first 3-7 steps.
+# plugins:
+#   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+chat_template: chatml
+datasets:
+  - path: laion/Sera-4.5A-Full-T1-v3-316
+    data_files:
+      - sera-4.5a-full-t1_v3_316.jsonl
+    type: chat_template
+    field_messages: messages
+    ds_type: json
+    message_field_training: train
+
+dataset_prepared_path: /e/data1/datasets/playground/ot-baf/axolotl_dataset_cache/sera-v3-316
+output_dir: /e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B
+
+sequence_len: 32768
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name: sera-v3-316-axolotl__Qwen3-8B
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 1e-5
+adam_beta1: 0.9
+adam_beta2: 0.95
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+activation_offloading: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_ratio: 0.1875
+evals_per_epoch: 0
+save_strategy: epoch
+
+weight_decay: 0.01
+max_grad_norm: 1.0
+special_tokens:
+
+```
+
+</details><br>
+
+# e/data1/datasets/playground/ot-baf/checkpoints/sera-v3-316-axolotl__Qwen3-8B
+
+This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the laion/Sera-4.5A-Full-T1-v3-316 dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 32
+- total_eval_batch_size: 4
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.95) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 2
+- training_steps: 10
+
+### Training results
+
+
+
+### Framework versions
+
+- Transformers 5.5.0
+- Pytorch 2.9.1+cu130
+- Datasets 4.5.0
+- Tokenizers 0.22.2
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,4 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,71 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
--- a/debug.log
+++ b/debug.log
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,12 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.5.0"
+}
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96acbf1452bc4a83a778a3ce60b5994ccad95aad6ce9fbc2db713c0545be33b4
+size 16381517176
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32d35c4f09479b0b6c555c83f0f79bfd301bbe53325dc6da828d6a32a18a86a5
+size 8849