commit a1988ac48a36b6c59c0c07e0a594d4cf9497e5a4 Author: ModelHub XC Date: Thu May 28 23:56:19 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Columbia-NLP/LION-Gemma-2b-odpo-v1.0 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 
100644 index 0000000..dd5357a --- /dev/null +++ b/README.md @@ -0,0 +1,108 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for LION-Gemma-2b-odpo-v1.0 + +The LION-series are trained using an **empirically optimized pipeline** that consists of three stages: SFT, DPO, and online preference learning (online DPO). We find simple techniques such as sequence packing, loss masking in SFT, increasing the preference dataset size in DPO, and online DPO training can significantly improve the performance of language models. Our best models (the LION-series) **exceed the performance of the official instruct models** tuned with closed-source data and algorithms. + +For training datasets, code, and evaluation scripts, please refer to our [paper](https://arxiv.org/abs/2407.06542) and [codebase](https://github.com/Columbia-NLP-Lab/LionAlignment). + + +## Model description + +This model is finetuned from [`Columbia-NLP/LION-Gemma-2b-dpo-v1.0`](https://huggingface.co/Columbia-NLP/LION-Gemma-2b-dpo-v1.0) using online DPO from the LION pipeline. 
+ +- **Model type:** [`gemma-2b`](https://huggingface.co/google/gemma-2b) +- **Language(s) (NLP):** Primarily English +- **License:** Gemma Terms of Use +- **Finetuned from model:** [`Columbia-NLP/LION-Gemma-2b-dpo-v1.0`](https://huggingface.co/Columbia-NLP/LION-Gemma-2b-dpo-v1.0) + + +## Performance + +| Model | Method | Size | Arena-Hard | AlpacaEval-2 | MT-Bench | OpenLLM | +|-------------|--------|------|------:|------:|---------:|-------:| +|[Gemma-2b](https://huggingface.co/google/gemma-2b) | - | 2B | - | - | - | 46.69 | +|[Gemma-2b-it](https://huggingface.co/google/gemma-2b-it) | SFT+RLHF | 2B | 3.4 | 5.44 | 5.63 | 42.75 | +|[Gemma-2b-zephyr](https://huggingface.co/wandb/gemma-2b-zephyr-dpo) | SFT+DPO | 2B | 0.9 | 2.65 | 4.13 | 46.92 | +|[LLaMA-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | SFT | 7B | 4.6 | 5.35 | 6.22 | 53.16 | +|[Vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | SFT | 7B | 2.5 | 7.62 | 6.57 | 52.06 | +|[LION-Gemma-2b-sft-v1.0 (ours)](https://huggingface.co/Columbia-NLP/LION-Gemma-2b-sft-v1.0) | SFT | 2B | 2.4 | 7.79 | 6.37 | 54.78 | +|[LION-Gemma-2b-dpo-v1.0 (ours)](https://huggingface.co/Columbia-NLP/LION-Gemma-2b-dpo-v1.0) | SFT+DPO | 2B | 4.6 | 8.75 | 6.58 | 55.35 | +|⮕ [LION-Gemma-2b-odpo-v1.0 (ours)](https://huggingface.co/Columbia-NLP/LION-Gemma-2b-odpo-v1.0) | SFT+DPO+ODPO | 2B | 5.0 | 9.57 | 6.75 | 55.98 | + + +## Intended uses + +To ensure reproducibility, please use the following chat templates: + +```python +import torch +from transformers import pipeline + +pipe = pipeline( + "text-generation", + model="Columbia-NLP/LION-Gemma-2b-odpo-v1.0", + device_map="auto", + torch_dtype=torch.bfloat16, +) +messages = [ + { + "role": "system", + "content": "", + }, + { + "role": "user", + "content": "Write a short paragraph where every sentence start with the letter A." 
+ }, +] +outputs = pipe( + messages, + max_new_tokens=128, + do_sample=True, + temperature=0.7, + top_p=0.7, + stop_sequence="<|im_end|>", +) +print(outputs[0]["generated_text"][-1]["content"]) +# Alice always aspired to acquire ample adventure. +# Astonishingly, amid abundant allurements, Alice allocated ample attention to each activity, ensuring an array of adventures. +# Albeit anxieties arose, Alice assuaged them with affirmations, ardently advancing ambitiously towards an array of adventures. +``` + +To inspect the chat template or run generation manually (this snippet additionally requires `from transformers import AutoTokenizer`): + +```python +tokenizer = AutoTokenizer.from_pretrained("Columbia-NLP/LION-Gemma-2b-odpo-v1.0") +prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +print(prompt) +# tokenize prompt and use model.generate +``` + + +### Training details + +Please refer to our [paper](https://arxiv.org/abs/2407.06542) and [codebase](https://github.com/Columbia-NLP-Lab/LionAlignment). + + +## Citation Information + +If you find this model useful in your work, please consider citing our paper: + +``` +@misc{yu2024lionsempiricallyoptimizedapproach, + title={LIONs: An Empirically Optimized Approach to Align Language Models}, + author={Xiao Yu and Qingyang Wu and Yu Li and Zhou Yu}, + year={2024}, + eprint={2407.06542}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2407.06542}, +} +``` + +## Acknowledgements + +We thank the Columbia-NLP group and [articulate.ai](https://www.articulateai.com/) for providing OpenAI API credits and computational resources to conduct our experiments. 
diff --git a/config.json b/config.json new file mode 100644 index 0000000..c90b3c5 --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "../model_checkpoints_coffee/oil/lion-gemma-v0.1/gemma-2b-lion-v0.6-165k-2epoch-online-60k-bigmix-beta0.05-epoch1-bsz64-zero1", + "architectures": [ + "GemmaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "head_dim": 256, + "hidden_act": "gelu", + "hidden_activation": null, + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 8192, + "model_type": "gemma", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "torch_dtype": "float32", + "transformers_version": "4.39.1", + "use_cache": false, + "vocab_size": 256000 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..e006521 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.39.1", + "use_cache": false +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000..e692edc --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9bb09bd06490787ae2ef476a8d241a43b884503ec1a651819658e62b8c37e0 +size 4911635192 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000..acc2a69 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcfbff053edb3288072d5eac12f4b179e4daeb60d5c230c9e8b479a0ce26a768 +size 4978830584 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new 
file mode 100644 index 0000000..2c6ac65 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb7830d254efd1ed4ca015a94880aba249eb980123190d1aa182369bb7a7c1e +size 134242760 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..83cce8a --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,171 @@ +{ + "metadata": { + "total_size": 10024689664 + }, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", 
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + 
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..fa665d5 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,34 @@ +{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": { + "content": "", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..4486ab3 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22449cb9ef4bad0db7dd93b46ddff7ab7d6a654dd4f903e130ddb6361eac3af5 +size 17477473 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..90bc33e --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,70 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "106": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>" + ], + "bos_token": "", + "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% 
endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<eos>", + "legacy": null, + "model_max_length": 8192, + "pad_token": "<pad>", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false +}