初始化项目,由ModelHub XC社区提供模型
Model: CultriX/MistralTrix-v1 Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
751
MistralTrix.ipynb
Normal file
751
MistralTrix.ipynb
Normal file
@@ -0,0 +1,751 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"provenance": [],
|
||||||
|
"machine_shape": "hm",
|
||||||
|
"gpuType": "A100"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
},
|
||||||
|
"widgets": {
|
||||||
|
"application/vnd.jupyter.widget-state+json": {
|
||||||
|
"22773c721a7c4221a9c14cd388461d4c": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "HBoxModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_dom_classes": [],
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "HBoxModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/controls",
|
||||||
|
"_view_module_version": "1.5.0",
|
||||||
|
"_view_name": "HBoxView",
|
||||||
|
"box_style": "",
|
||||||
|
"children": [
|
||||||
|
"IPY_MODEL_6b54841f5de1482694c360095dae3039",
|
||||||
|
"IPY_MODEL_448ccbc85e624ec3b3e71931a7ee4ff6",
|
||||||
|
"IPY_MODEL_173769f6f465485f8848a11bf269850b"
|
||||||
|
],
|
||||||
|
"layout": "IPY_MODEL_60978b9b4e8348f0a71ce3e35c73bcff"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"6b54841f5de1482694c360095dae3039": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "HTMLModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_dom_classes": [],
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "HTMLModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/controls",
|
||||||
|
"_view_module_version": "1.5.0",
|
||||||
|
"_view_name": "HTMLView",
|
||||||
|
"description": "",
|
||||||
|
"description_tooltip": null,
|
||||||
|
"layout": "IPY_MODEL_6a38dcbaf4674b448329ac0a16587d2a",
|
||||||
|
"placeholder": "",
|
||||||
|
"style": "IPY_MODEL_7eaeada2158e493189449af91f643553",
|
||||||
|
"value": "Loading checkpoint shards: 100%"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"448ccbc85e624ec3b3e71931a7ee4ff6": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "FloatProgressModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_dom_classes": [],
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "FloatProgressModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/controls",
|
||||||
|
"_view_module_version": "1.5.0",
|
||||||
|
"_view_name": "ProgressView",
|
||||||
|
"bar_style": "success",
|
||||||
|
"description": "",
|
||||||
|
"description_tooltip": null,
|
||||||
|
"layout": "IPY_MODEL_6e32854952b340008edca0139d3471d6",
|
||||||
|
"max": 3,
|
||||||
|
"min": 0,
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"style": "IPY_MODEL_db6d7cfcdade4b4baa213a5d0abc07d7",
|
||||||
|
"value": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"173769f6f465485f8848a11bf269850b": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "HTMLModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_dom_classes": [],
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "HTMLModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/controls",
|
||||||
|
"_view_module_version": "1.5.0",
|
||||||
|
"_view_name": "HTMLView",
|
||||||
|
"description": "",
|
||||||
|
"description_tooltip": null,
|
||||||
|
"layout": "IPY_MODEL_9083029642744c43b7705532cbe0cf79",
|
||||||
|
"placeholder": "",
|
||||||
|
"style": "IPY_MODEL_d028a98caa13425b907ceb513119006e",
|
||||||
|
"value": " 3/3 [00:11<00:00, 2.89s/it]"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"60978b9b4e8348f0a71ce3e35c73bcff": {
|
||||||
|
"model_module": "@jupyter-widgets/base",
|
||||||
|
"model_name": "LayoutModel",
|
||||||
|
"model_module_version": "1.2.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/base",
|
||||||
|
"_model_module_version": "1.2.0",
|
||||||
|
"_model_name": "LayoutModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "LayoutView",
|
||||||
|
"align_content": null,
|
||||||
|
"align_items": null,
|
||||||
|
"align_self": null,
|
||||||
|
"border": null,
|
||||||
|
"bottom": null,
|
||||||
|
"display": null,
|
||||||
|
"flex": null,
|
||||||
|
"flex_flow": null,
|
||||||
|
"grid_area": null,
|
||||||
|
"grid_auto_columns": null,
|
||||||
|
"grid_auto_flow": null,
|
||||||
|
"grid_auto_rows": null,
|
||||||
|
"grid_column": null,
|
||||||
|
"grid_gap": null,
|
||||||
|
"grid_row": null,
|
||||||
|
"grid_template_areas": null,
|
||||||
|
"grid_template_columns": null,
|
||||||
|
"grid_template_rows": null,
|
||||||
|
"height": null,
|
||||||
|
"justify_content": null,
|
||||||
|
"justify_items": null,
|
||||||
|
"left": null,
|
||||||
|
"margin": null,
|
||||||
|
"max_height": null,
|
||||||
|
"max_width": null,
|
||||||
|
"min_height": null,
|
||||||
|
"min_width": null,
|
||||||
|
"object_fit": null,
|
||||||
|
"object_position": null,
|
||||||
|
"order": null,
|
||||||
|
"overflow": null,
|
||||||
|
"overflow_x": null,
|
||||||
|
"overflow_y": null,
|
||||||
|
"padding": null,
|
||||||
|
"right": null,
|
||||||
|
"top": null,
|
||||||
|
"visibility": null,
|
||||||
|
"width": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"6a38dcbaf4674b448329ac0a16587d2a": {
|
||||||
|
"model_module": "@jupyter-widgets/base",
|
||||||
|
"model_name": "LayoutModel",
|
||||||
|
"model_module_version": "1.2.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/base",
|
||||||
|
"_model_module_version": "1.2.0",
|
||||||
|
"_model_name": "LayoutModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "LayoutView",
|
||||||
|
"align_content": null,
|
||||||
|
"align_items": null,
|
||||||
|
"align_self": null,
|
||||||
|
"border": null,
|
||||||
|
"bottom": null,
|
||||||
|
"display": null,
|
||||||
|
"flex": null,
|
||||||
|
"flex_flow": null,
|
||||||
|
"grid_area": null,
|
||||||
|
"grid_auto_columns": null,
|
||||||
|
"grid_auto_flow": null,
|
||||||
|
"grid_auto_rows": null,
|
||||||
|
"grid_column": null,
|
||||||
|
"grid_gap": null,
|
||||||
|
"grid_row": null,
|
||||||
|
"grid_template_areas": null,
|
||||||
|
"grid_template_columns": null,
|
||||||
|
"grid_template_rows": null,
|
||||||
|
"height": null,
|
||||||
|
"justify_content": null,
|
||||||
|
"justify_items": null,
|
||||||
|
"left": null,
|
||||||
|
"margin": null,
|
||||||
|
"max_height": null,
|
||||||
|
"max_width": null,
|
||||||
|
"min_height": null,
|
||||||
|
"min_width": null,
|
||||||
|
"object_fit": null,
|
||||||
|
"object_position": null,
|
||||||
|
"order": null,
|
||||||
|
"overflow": null,
|
||||||
|
"overflow_x": null,
|
||||||
|
"overflow_y": null,
|
||||||
|
"padding": null,
|
||||||
|
"right": null,
|
||||||
|
"top": null,
|
||||||
|
"visibility": null,
|
||||||
|
"width": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"7eaeada2158e493189449af91f643553": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "DescriptionStyleModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "DescriptionStyleModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "StyleView",
|
||||||
|
"description_width": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"6e32854952b340008edca0139d3471d6": {
|
||||||
|
"model_module": "@jupyter-widgets/base",
|
||||||
|
"model_name": "LayoutModel",
|
||||||
|
"model_module_version": "1.2.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/base",
|
||||||
|
"_model_module_version": "1.2.0",
|
||||||
|
"_model_name": "LayoutModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "LayoutView",
|
||||||
|
"align_content": null,
|
||||||
|
"align_items": null,
|
||||||
|
"align_self": null,
|
||||||
|
"border": null,
|
||||||
|
"bottom": null,
|
||||||
|
"display": null,
|
||||||
|
"flex": null,
|
||||||
|
"flex_flow": null,
|
||||||
|
"grid_area": null,
|
||||||
|
"grid_auto_columns": null,
|
||||||
|
"grid_auto_flow": null,
|
||||||
|
"grid_auto_rows": null,
|
||||||
|
"grid_column": null,
|
||||||
|
"grid_gap": null,
|
||||||
|
"grid_row": null,
|
||||||
|
"grid_template_areas": null,
|
||||||
|
"grid_template_columns": null,
|
||||||
|
"grid_template_rows": null,
|
||||||
|
"height": null,
|
||||||
|
"justify_content": null,
|
||||||
|
"justify_items": null,
|
||||||
|
"left": null,
|
||||||
|
"margin": null,
|
||||||
|
"max_height": null,
|
||||||
|
"max_width": null,
|
||||||
|
"min_height": null,
|
||||||
|
"min_width": null,
|
||||||
|
"object_fit": null,
|
||||||
|
"object_position": null,
|
||||||
|
"order": null,
|
||||||
|
"overflow": null,
|
||||||
|
"overflow_x": null,
|
||||||
|
"overflow_y": null,
|
||||||
|
"padding": null,
|
||||||
|
"right": null,
|
||||||
|
"top": null,
|
||||||
|
"visibility": null,
|
||||||
|
"width": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"db6d7cfcdade4b4baa213a5d0abc07d7": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "ProgressStyleModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "ProgressStyleModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "StyleView",
|
||||||
|
"bar_color": null,
|
||||||
|
"description_width": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"9083029642744c43b7705532cbe0cf79": {
|
||||||
|
"model_module": "@jupyter-widgets/base",
|
||||||
|
"model_name": "LayoutModel",
|
||||||
|
"model_module_version": "1.2.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/base",
|
||||||
|
"_model_module_version": "1.2.0",
|
||||||
|
"_model_name": "LayoutModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "LayoutView",
|
||||||
|
"align_content": null,
|
||||||
|
"align_items": null,
|
||||||
|
"align_self": null,
|
||||||
|
"border": null,
|
||||||
|
"bottom": null,
|
||||||
|
"display": null,
|
||||||
|
"flex": null,
|
||||||
|
"flex_flow": null,
|
||||||
|
"grid_area": null,
|
||||||
|
"grid_auto_columns": null,
|
||||||
|
"grid_auto_flow": null,
|
||||||
|
"grid_auto_rows": null,
|
||||||
|
"grid_column": null,
|
||||||
|
"grid_gap": null,
|
||||||
|
"grid_row": null,
|
||||||
|
"grid_template_areas": null,
|
||||||
|
"grid_template_columns": null,
|
||||||
|
"grid_template_rows": null,
|
||||||
|
"height": null,
|
||||||
|
"justify_content": null,
|
||||||
|
"justify_items": null,
|
||||||
|
"left": null,
|
||||||
|
"margin": null,
|
||||||
|
"max_height": null,
|
||||||
|
"max_width": null,
|
||||||
|
"min_height": null,
|
||||||
|
"min_width": null,
|
||||||
|
"object_fit": null,
|
||||||
|
"object_position": null,
|
||||||
|
"order": null,
|
||||||
|
"overflow": null,
|
||||||
|
"overflow_x": null,
|
||||||
|
"overflow_y": null,
|
||||||
|
"padding": null,
|
||||||
|
"right": null,
|
||||||
|
"top": null,
|
||||||
|
"visibility": null,
|
||||||
|
"width": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"d028a98caa13425b907ceb513119006e": {
|
||||||
|
"model_module": "@jupyter-widgets/controls",
|
||||||
|
"model_name": "DescriptionStyleModel",
|
||||||
|
"model_module_version": "1.5.0",
|
||||||
|
"state": {
|
||||||
|
"_model_module": "@jupyter-widgets/controls",
|
||||||
|
"_model_module_version": "1.5.0",
|
||||||
|
"_model_name": "DescriptionStyleModel",
|
||||||
|
"_view_count": null,
|
||||||
|
"_view_module": "@jupyter-widgets/base",
|
||||||
|
"_view_module_version": "1.2.0",
|
||||||
|
"_view_name": "StyleView",
|
||||||
|
"description_width": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"accelerator": "GPU"
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"# MistralTrix\n",
|
||||||
|
"\n",
|
||||||
|
"❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne)."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "Pa8905-YsHAn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "_zIBL8IssExG"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install -q datasets trl peft bitsandbytes sentencepiece wandb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import gc\n",
|
||||||
|
"import torch\n",
|
||||||
|
"\n",
|
||||||
|
"import transformers\n",
|
||||||
|
"from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
|
||||||
|
"from datasets import load_dataset\n",
|
||||||
|
"from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training\n",
|
||||||
|
"from trl import DPOTrainer\n",
|
||||||
|
"import bitsandbytes as bnb\n",
|
||||||
|
"from google.colab import userdata\n",
|
||||||
|
"import wandb\n",
|
||||||
|
"\n",
|
||||||
|
"# Defined in the secrets tab in Google Colab\n",
|
||||||
|
"hf_token = userdata.get('huggingface')\n",
|
||||||
|
"wb_token = userdata.get('wandb')\n",
|
||||||
|
"wandb.login(key=wb_token)\n",
|
||||||
|
"\n",
|
||||||
|
"model_name = \"zyh3826/GML-Mistral-merged-v1\"\n",
|
||||||
|
"new_model = \"MistralTrix-v1\""
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "YpdkZsMNylvp",
|
||||||
|
"outputId": "6c2df234-1ce7-4cd2-a7e3-567e7536319f"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
"/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_config.py:141: UserWarning: The `optimize_cuda_cache` arguement will be deprecated soon, please use `optimize_device_cache` instead.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmlabonne\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
|
||||||
|
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
|
||||||
|
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
|
||||||
|
"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Format dataset"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "d8CvUgROUDw-"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def chatml_format(example):\n",
|
||||||
|
" # Format system\n",
|
||||||
|
" if len(example['system']) > 0:\n",
|
||||||
|
" message = {\"role\": \"system\", \"content\": example['system']}\n",
|
||||||
|
" system = tokenizer.apply_chat_template([message], tokenize=False)\n",
|
||||||
|
" else:\n",
|
||||||
|
" system = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Format instruction\n",
|
||||||
|
" message = {\"role\": \"user\", \"content\": example['question']}\n",
|
||||||
|
" prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)\n",
|
||||||
|
"\n",
|
||||||
|
" # Format chosen answer\n",
|
||||||
|
" chosen = example['chosen'] + \"<|im_end|>\\n\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Format rejected answer\n",
|
||||||
|
" rejected = example['rejected'] + \"<|im_end|>\\n\"\n",
|
||||||
|
"\n",
|
||||||
|
" return {\n",
|
||||||
|
" \"prompt\": system + prompt,\n",
|
||||||
|
" \"chosen\": chosen,\n",
|
||||||
|
" \"rejected\": rejected,\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
"# Load dataset\n",
|
||||||
|
"dataset = load_dataset(\"Intel/orca_dpo_pairs\")['train']\n",
|
||||||
|
"\n",
|
||||||
|
"# Save columns\n",
|
||||||
|
"original_columns = dataset.column_names\n",
|
||||||
|
"\n",
|
||||||
|
"# Tokenizer\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||||
|
"tokenizer.pad_token = tokenizer.eos_token\n",
|
||||||
|
"tokenizer.padding_side = \"left\"\n",
|
||||||
|
"\n",
|
||||||
|
"# Format dataset\n",
|
||||||
|
"dataset = dataset.map(\n",
|
||||||
|
" chatml_format,\n",
|
||||||
|
" remove_columns=original_columns\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Print sample\n",
|
||||||
|
"dataset[1]"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "MCD77GZ60DOT",
|
||||||
|
"outputId": "c7c6773c-5545-4fee-bfa3-6fa6d69c0f3f"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
|
||||||
|
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'prompt': '<|im_start|>system\\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\\n<|im_start|>user\\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\\n<|im_start|>assistant\\n',\n",
|
||||||
|
" 'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\\n',\n",
|
||||||
|
" 'rejected': ' Sure! Here\\'s a sentence that describes all the data you provided:\\n\\n\"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes.\"<|im_end|>\\n'}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 3
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Train model with DPO"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "DeT5eUK_UJgK"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# LoRA configuration\n",
|
||||||
|
"peft_config = LoraConfig(\n",
|
||||||
|
" r=16,\n",
|
||||||
|
" lora_alpha=16,\n",
|
||||||
|
" lora_dropout=0.05,\n",
|
||||||
|
" bias=\"none\",\n",
|
||||||
|
" task_type=\"CAUSAL_LM\",\n",
|
||||||
|
" target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Model to fine-tune\n",
|
||||||
|
"model = AutoModelForCausalLM.from_pretrained(\n",
|
||||||
|
" model_name,\n",
|
||||||
|
" torch_dtype=torch.float16,\n",
|
||||||
|
" load_in_4bit=True\n",
|
||||||
|
")\n",
|
||||||
|
"model.config.use_cache = False\n",
|
||||||
|
"\n",
|
||||||
|
"# Reference model\n",
|
||||||
|
"ref_model = AutoModelForCausalLM.from_pretrained(\n",
|
||||||
|
" model_name,\n",
|
||||||
|
" torch_dtype=torch.float16,\n",
|
||||||
|
" load_in_4bit=True\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Training arguments\n",
|
||||||
|
"training_args = TrainingArguments(\n",
|
||||||
|
" per_device_train_batch_size=4,\n",
|
||||||
|
" gradient_accumulation_steps=4,\n",
|
||||||
|
" gradient_checkpointing=True,\n",
|
||||||
|
" learning_rate=5e-5,\n",
|
||||||
|
" lr_scheduler_type=\"cosine\",\n",
|
||||||
|
" max_steps=200,\n",
|
||||||
|
" save_strategy=\"no\",\n",
|
||||||
|
" logging_steps=1,\n",
|
||||||
|
" output_dir=new_model,\n",
|
||||||
|
" optim=\"paged_adamw_32bit\",\n",
|
||||||
|
" warmup_steps=100,\n",
|
||||||
|
" bf16=True,\n",
|
||||||
|
" report_to=\"wandb\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Create DPO trainer\n",
|
||||||
|
"dpo_trainer = DPOTrainer(\n",
|
||||||
|
" model,\n",
|
||||||
|
" ref_model,\n",
|
||||||
|
" args=training_args,\n",
|
||||||
|
" train_dataset=dataset,\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" peft_config=peft_config,\n",
|
||||||
|
" beta=0.1,\n",
|
||||||
|
" max_prompt_length=1024,\n",
|
||||||
|
" max_length=1536,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Fine-tune model with DPO\n",
|
||||||
|
"dpo_trainer.train()"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "rKPILNOLR-aK"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Upload model"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "3LdhPpcrUM3H"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Save artifacts\n",
|
||||||
|
"dpo_trainer.model.save_pretrained(\"final_checkpoint\")\n",
|
||||||
|
"tokenizer.save_pretrained(\"final_checkpoint\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Flush memory\n",
|
||||||
|
"del dpo_trainer, model, ref_model\n",
|
||||||
|
"gc.collect()\n",
|
||||||
|
"torch.cuda.empty_cache()\n",
|
||||||
|
"\n",
|
||||||
|
"# Reload model in FP16 (instead of NF4)\n",
|
||||||
|
"base_model = AutoModelForCausalLM.from_pretrained(\n",
|
||||||
|
" model_name,\n",
|
||||||
|
" return_dict=True,\n",
|
||||||
|
" torch_dtype=torch.float16,\n",
|
||||||
|
")\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||||
|
"\n",
|
||||||
|
"# Merge base model with the adapter\n",
|
||||||
|
"model = PeftModel.from_pretrained(base_model, \"final_checkpoint\")\n",
|
||||||
|
"model = model.merge_and_unload()\n",
|
||||||
|
"\n",
|
||||||
|
"# Save model and tokenizer\n",
|
||||||
|
"model.save_pretrained(new_model)\n",
|
||||||
|
"tokenizer.save_pretrained(new_model)\n",
|
||||||
|
"\n",
|
||||||
|
"# Push them to the HF Hub\n",
|
||||||
|
"model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)\n",
|
||||||
|
"tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "h7cIvxcTfBC4"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Inference"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "G6EFsmS4UOgV"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Format prompt\n",
|
||||||
|
"message = [\n",
|
||||||
|
" {\"role\": \"system\", \"content\": \"You are a helpful assistant chatbot.\"},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": \"What is a Large Language Model?\"}\n",
|
||||||
|
"]\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained(new_model)\n",
|
||||||
|
"prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)\n",
|
||||||
|
"\n",
|
||||||
|
"# Create pipeline\n",
|
||||||
|
"pipeline = transformers.pipeline(\n",
|
||||||
|
" \"text-generation\",\n",
|
||||||
|
" model=new_model,\n",
|
||||||
|
" tokenizer=tokenizer\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Generate text\n",
|
||||||
|
"sequences = pipeline(\n",
|
||||||
|
" prompt,\n",
|
||||||
|
" do_sample=True,\n",
|
||||||
|
" temperature=0.7,\n",
|
||||||
|
" top_p=0.9,\n",
|
||||||
|
" num_return_sequences=1,\n",
|
||||||
|
" max_length=200,\n",
|
||||||
|
")\n",
|
||||||
|
"print(sequences[0]['generated_text'])"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/",
|
||||||
|
"height": 251,
|
||||||
|
"referenced_widgets": [
|
||||||
|
"22773c721a7c4221a9c14cd388461d4c",
|
||||||
|
"6b54841f5de1482694c360095dae3039",
|
||||||
|
"448ccbc85e624ec3b3e71931a7ee4ff6",
|
||||||
|
"173769f6f465485f8848a11bf269850b",
|
||||||
|
"60978b9b4e8348f0a71ce3e35c73bcff",
|
||||||
|
"6a38dcbaf4674b448329ac0a16587d2a",
|
||||||
|
"7eaeada2158e493189449af91f643553",
|
||||||
|
"6e32854952b340008edca0139d3471d6",
|
||||||
|
"db6d7cfcdade4b4baa213a5d0abc07d7",
|
||||||
|
"9083029642744c43b7705532cbe0cf79",
|
||||||
|
"d028a98caa13425b907ceb513119006e"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"id": "LAEUZFjvlJOv",
|
||||||
|
"outputId": "9b5720c7-49ef-45c7-e5a7-f38d64899b1e"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "display_data",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
|
||||||
|
],
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0,
|
||||||
|
"model_id": "22773c721a7c4221a9c14cd388461d4c"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
"/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1473: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
"Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"<|im_start|>system\n",
|
||||||
|
"You are a helpful assistant chatbot.<|im_end|>\n",
|
||||||
|
"<|im_start|>user\n",
|
||||||
|
"What is a Large Language Model?<|im_end|>\n",
|
||||||
|
"<|im_start|>assistant\n",
|
||||||
|
"A large language model is a type of artificial intelligence (AI) system that has been trained on vast amounts of text data. These models are designed to understand and generate human language, allowing them to perform various natural language processing tasks, such as text generation, language translation, and question answering. Large language models typically use deep learning techniques, like recurrent neural networks (RNNs) or transformers, to learn patterns and relationships in the data, enabling them to generate coherent and contextually relevant responses. The size of these models, in terms of the number of parameters and the volume of data they are trained on, plays a significant role in their ability to comprehend and produce complex language structures.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
111
README.md
Normal file
111
README.md
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
---
|
||||||
|
license: apache-2.0
|
||||||
|
language:
|
||||||
|
- en
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
dtype: bfloat16
|
||||||
|
tags:
|
||||||
|
- merge
|
||||||
|
---
|
||||||
|
|
||||||
|
# EDIT:
|
||||||
|
Always check my space for the latest benchmark results for my models!
|
||||||
|
* https://huggingface.co/spaces/CultriX/Yet_Another_LLM_Leaderboard
|
||||||
|
|
||||||
|
# Results:
|
||||||
|
T: 🟦
|
||||||
|
Model: CultriX/MistralTrix-v1 📑
|
||||||
|
Average: 73.39
|
||||||
|
ARC: 72.27
|
||||||
|
HellaSwag: 88.33
|
||||||
|
MMLU: 65.24
|
||||||
|
TruthfulQA: 70.73
|
||||||
|
Winogrande: 80.98
|
||||||
|
GSM8K: 62.77
|
||||||
|
|
||||||
|
# Edit/Disclaimer:
|
||||||
|
Currently the #1 ranked 7B LLM on the LLM Leaderboards, woah!
|
||||||
|
I did not expect that result at all and am in no way a professional when it comes to LLMs or computer science in general,
|
||||||
|
just a guy that likes to nerd about and tinker around.
|
||||||
|
|
||||||
|
For those wondering how I achieved this, the answer is that I simply attempted to apply the techniques outlined in this amazing article myself: https://towardsdatascience.com/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
|
||||||
|
Therefore, all credit basically goes to the guy who wrote that.
|
||||||
|
He offers the exact Colab notebook I used to train this model for free, as well as a really nice GitHub page I hope he doesn't mind me sharing: https://github.com/mlabonne/llm-course/
|
||||||
|
So a huge thank you to him for sharing his knowledge and teaching me a thing or two in the process!
|
||||||
|
|
||||||
|
# GGUF
|
||||||
|
I attempted to quantize the model myself, which again I pretty much have no clue about, but the quantized versions seem to run fine for me when I test them:
|
||||||
|
https://huggingface.co/CultriX/MistralTrix-v1-GGUF
|
||||||
|
|
||||||
|
I'll say it one more time though:
|
||||||
|
"I am a complete beginner to all of this, so if these do end up sucking don't be surprised."
|
||||||
|
|
||||||
|
You have been warned :)
|
||||||
|
|
||||||
|
# Description:
|
||||||
|
(trained on a single Colab GPU in less than a few hours)
|
||||||
|
|
||||||
|
MistralTrix-v1 is a zyh3826/GML-Mistral-merged-v1 model that has been further fine-tuned with Direct Preference Optimization (DPO) using Intel's dataset for neural-chat-7b-v3-1.
|
||||||
|
It surpasses the original model on several benchmarks (see results).
|
||||||
|
|
||||||
|
It is directly inspired by the RLHF process described by Intel/neural-chat-7b-v3-1's authors to improve performance.
|
||||||
|
I used the same dataset and reformatted it to apply the ChatML template.
|
||||||
|
|
||||||
|
The code to train this model is available on Google Colab and GitHub.
|
||||||
|
Fine-tuning took about an hour on a Google Colab A100 GPU with 40GB VRAM.
|
||||||
|
|
||||||
|
# TRAINING SPECIFICATIONS
|
||||||
|
> LoRA configuration
|
||||||
|
peft_config = LoraConfig(
|
||||||
|
r=16,
|
||||||
|
lora_alpha=16,
|
||||||
|
lora_dropout=0.05,
|
||||||
|
bias="none",
|
||||||
|
task_type="CAUSAL_LM",
|
||||||
|
target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
|
||||||
|
)
|
||||||
|
|
||||||
|
> Model to fine-tune
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
load_in_4bit=True
|
||||||
|
)
|
||||||
|
model.config.use_cache = False
|
||||||
|
|
||||||
|
> Reference model
|
||||||
|
ref_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
load_in_4bit=True
|
||||||
|
)
|
||||||
|
|
||||||
|
> Training arguments
|
||||||
|
training_args = TrainingArguments(
|
||||||
|
per_device_train_batch_size=4,
|
||||||
|
gradient_accumulation_steps=4,
|
||||||
|
gradient_checkpointing=True,
|
||||||
|
learning_rate=5e-5,
|
||||||
|
lr_scheduler_type="cosine",
|
||||||
|
max_steps=200,
|
||||||
|
save_strategy="no",
|
||||||
|
logging_steps=1,
|
||||||
|
output_dir=new_model,
|
||||||
|
optim="paged_adamw_32bit",
|
||||||
|
warmup_steps=100,
|
||||||
|
bf16=True,
|
||||||
|
report_to="wandb",
|
||||||
|
)
|
||||||
|
|
||||||
|
> Create DPO trainer
|
||||||
|
dpo_trainer = DPOTrainer(
|
||||||
|
model,
|
||||||
|
ref_model,
|
||||||
|
args=training_args,
|
||||||
|
train_dataset=dataset,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
peft_config=peft_config,
|
||||||
|
beta=0.1,
|
||||||
|
max_prompt_length=1024,
|
||||||
|
max_length=1536,
|
||||||
|
)
|
||||||
26
config.json
Normal file
26
config.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "zyh3826/GML-Mistral-merged-v1",
|
||||||
|
"architectures": [
|
||||||
|
"MistralForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 14336,
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"model_type": "mistral",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 40,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_theta": 10000.0,
|
||||||
|
"sliding_window": 4096,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "float16",
|
||||||
|
"transformers_version": "4.35.2",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 32000
|
||||||
|
}
|
||||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"transformers_version": "4.35.2"
|
||||||
|
}
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:bbeee2819e4689ba08b2e3003e2fb4fc8641f1cbfaf29b2417f1ff272a5a89e7
|
||||||
|
size 4943162240
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:91f4a6f841456249d24331d001d5d949cf7fbd95d38fb7f12128d507a17d2f17
|
||||||
|
size 4999819232
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fc953409846ac5f4f5055d6e5a1d6c6a45c9ea905bcda9499b625a317bf3c6a1
|
||||||
|
size 4915916080
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:8f036303d4abd2ed53858c59ed47ca1385420b3b8cd7728b8da4cbac927435e8
|
||||||
|
size 3114400496
|
||||||
370
model.safetensors.index.json
Normal file
370
model.safetensors.index.json
Normal file
@@ -0,0 +1,370 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_size": 17973256192
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.36.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.37.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.38.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.39.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.norm.weight": "model-00004-of-00004.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
28
special_tokens_map.json
Normal file
28
special_tokens_map.json
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<unk>",
|
||||||
|
"<s>",
|
||||||
|
"</s>"
|
||||||
|
],
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
91122
tokenizer.json
Normal file
91122
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tokenizer.model
(Stored with Git LFS)
Normal file
BIN
tokenizer.model
(Stored with Git LFS)
Normal file
Binary file not shown.
46
tokenizer_config.json
Normal file
46
tokenizer_config.json
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
{
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"0": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"1": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<unk>",
|
||||||
|
"<s>",
|
||||||
|
"</s>"
|
||||||
|
],
|
||||||
|
"bos_token": "<s>",
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "</s>",
|
||||||
|
"legacy": true,
|
||||||
|
"model_max_length": 1000000000000000019884624838656,
|
||||||
|
"pad_token": null,
|
||||||
|
"padding_side": "left",
|
||||||
|
"sp_model_kwargs": {},
|
||||||
|
"spaces_between_special_tokens": false,
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "LlamaTokenizer",
|
||||||
|
"unk_token": "<unk>",
|
||||||
|
"use_default_system_prompt": true
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user