初始化项目，由ModelHub XC社区提供模型

Model: alpha-ai/clinical-info-corrector-1B Source: Original Platform
2026-05-13 12:48:37 +08:00
commit 8b8b026c9c
13 changed files with 2672 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,43 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+clinical-info-corrector-1B.png filter=lfs diff=lfs merge=lfs -text
+llama-3.2-1b-instruct.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+llama-3.2-1b-instruct.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+llama-3.2-1b-instruct.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+clinical-info-corrector-1B.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+clinical-info-corrector-1B.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+clinical-info-corrector-1B.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
--- a/57
+++ b/57
@@ -0,0 +1,57 @@
+
+# Base weights: the Q5_K_M GGUF of this fine-tune that ships in the same repository.
+FROM clinical-info-corrector-1B.Q5_K_M.gguf
+# Prompt template (Go text/template) producing the Llama 3 header / <|eot_id|> chat layout.
+# Multi-message requests take the .Messages branch; otherwise the single-turn
+# .System/.Prompt branch is used. .Response is appended at the end in both cases.
+# The system turn, including its closing <|eot_id|>, is emitted only when .System or .Tools is set.
+TEMPLATE """{{ if .Messages }}
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+{{- if .System }}
+
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the original use question.
+{{- end }}<|eot_id|>
+{{- end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 }}
+{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
+{{- if and $.Tools $last }}
+
+Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
+
+Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
+
+{{ $.Tools }}
+{{- end }}
+
+{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
+{{- if .ToolCalls }}
+
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}
+{{- else }}
+
+{{ .Content }}{{ if not $last }}<|eot_id|>{{ end }}
+{{- end }}
+{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|>
+
+{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- end }}
+{{- end }}
+{{- else }}
+{{- if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}{{ .Response }}{{ if .Response }}<|eot_id|>{{ end }}"""
+# Stop on any Llama 3 header / turn-control token so output ends with the assistant turn.
+PARAMETER stop "<|start_header_id|>"
+PARAMETER stop "<|end_header_id|>"
+PARAMETER stop "<|eot_id|>"
+PARAMETER stop "<|eom_id|>"
+# Low temperature keeps the rewrite conservative; the README recommends 0.3–0.7 for this model.
+PARAMETER temperature 0.3
+PARAMETER min_p 0.1
--- a/README.md
+++ b/README.md
@@ -0,0 +1,287 @@
+---
+base_model:
+- unsloth/Llama-3.2-1B-Instruct
+tags:
+- text-generation
+- transformers
+- unsloth
+- llama-3.2
+- clinical
+- medical
+- text-cleaning
+- finetuned
+license: other
+language:
+- eng
+datasets:
+- custom/clinical-text-cleaning
+---
+
+<div align="center">
+
+# 🩺 clinical-info-corrector-1B
+
+**Finetuned Llama 3.2 1B Instruct model for cleaning clinical text (histories, notes & staff remarks)**
+
+<img src="https://huggingface.co/alphaaico/clinical-info-corrector-1B/resolve/main/clinical-info-corrector-1B.png"
+     alt="clinical-info-corrector-1B" 
+     style="width: 500px;
+            height: auto;
+            object-position: center top;">
+
+</div>
+
+---
+
+## Model Card
+
+- **Model name:** `clinical-info-corrector-1B`  
+- **Base model:** `unsloth/Llama-3.2-1B-Instruct`  
+- **Finetuning framework:** [Unsloth](https://github.com/unslothai/unsloth) + TRL `SFTTrainer`  
+- **Task:** Clinical text *cleaning & rephrasing* (not diagnosis)  
+- **Status:** Experimental — trained on a relatively small, noisy custom dataset
+
+This model is designed to take **messy clinical content** (patient histories, discharge notes, staff remarks, “adviced” sections, etc.) and rewrite it into **clear, grammatically correct, medically faithful text** while preserving the underlying facts.
+
+It is *not* a diagnostic or decision-support model. Think of it as a domain-aware “clinical copy-editor” for structured text.
+
+---
+
+## Motivation & Intended Use
+
+Many hospital information systems accumulate:
+
+- telegraphic note fragments  
+- mixed-case, ALL-CAPS, abbreviations everywhere  
+- repeated or contradictory phrases  
+- encoding artefacts (e.g., `Â`, broken bullets, etc.)
+
+The goal of `clinical-info-corrector-1B` is to explore whether even a **small 1B parameter model**, fine-tuned on a **crude but realistic dataset**, can:
+
+- clean up this text,
+- normalize style and grammar,
+- expand common medical abbreviations where helpful, and
+- keep the **clinical meaning intact**.
+
+**Primary intended uses:**
+
+- Pre-processing / normalization of:
+  - patient histories  
+  - discharge summaries  
+  - clinician notes / staff remarks  
+  - “advice” / follow-up instructions
+- Creating cleaner text for:
+  - analytics / NLP pipelines  
+  - downstream LLMs (e.g., for summarization or coding)  
+  - internal documentation
+
+**Not intended for:**
+
+- Medical diagnosis or triage
+- Treatment recommendations
+- Patient-facing communication without clinician review
+
+---
+
+## Training Data & Format
+
+The model was finetuned on a **custom clinical dataset** derived from hospital discharge summaries and related fields. Each training sample is a pair:
+
+- `content` – noisy / raw clinical text (e.g. original HISTORY, treatment, adviced, mdescript1)  
+- `response` – cleaned version of that text (grammar fixed, abbreviations expanded, structure improved, artefacts removed)  
+
+Example pair (simplified):
+
+```text
+content:
+PATIENT ADMITTED WITH ABOVE SAID COMPLAINT FOR FURTHER EVALUATION AND MANAGEMENT.  -H/O LAP CHOLECYSTECTOMY(12 YEAR BACK
+
+response:
+The patient was admitted with the above-mentioned complaints for further evaluation and management.
+The patient has a history of laparoscopic cholecystectomy performed 12 years ago.
+```
+
+During finetuning, each row was wrapped as a **chat-style conversation**:
+
+```json
+[
+  {"role": "system", "content": "<system prompt (see below)>"},
+  {"role": "user",   "content": "<content>"},
+  {"role": "assistant", "content": "<response>"}
+]
+```
+
+The Llama 3.x chat template (`llama-3.1` in Unsloth) was used to convert this into the final training text.  
+Training used **`train_on_responses_only`**, so only the assistant segment (`response`) contributed to the loss.
+
+---
+
+## System Prompt (Baked-in Behavior)
+
+During training, a system message of this form was used:
+
+```text
+You are a medical AI assistant that rewrites noisy, telegraphic, or poorly formatted clinical text
+(eg, discharge summaries, admission notes, advice sections) into clear, grammatically correct,
+contextually rich prose.
+
+Given some raw 'content' from a clinician, you must:
+- preserve all medically relevant facts and timelines,
+- avoid inventing new clinical information,
+- expand or clarify abbreviations when appropriate,
+- fix spelling, grammar, punctuation, and formatting,
+- turn lists or fragments into coherent sentences and paragraphs.
+
+Your reply should be only the cleaned clinical text, nothing else.
+```
+
+For best results, you should **re-use a similar system prompt at inference time**.
+
+---
+
+## How to Use
+
+### 1. Basic Chat-style Inference (Transformers)
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+model_name = "your-username/clinical-info-corrector-1B"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype = torch.float16,
+    device_map = "auto",
+)
+
+SYSTEM_PROMPT = """
+You are a medical AI assistant that rewrites noisy, telegraphic, or poorly formatted clinical text
+into clear, grammatically correct, medically faithful prose. Preserve all medical facts and do not
+invent new information. Your reply should be only the cleaned clinical text.
+"""
+
+raw_content = "PATIENT ADMITTED WITH ABOVE SAID COMPLAINT FOR FURTHER EVALUATION AND MANAGEMENT. -H/O LAP CHOLECYSTECTOMY(12 YEAR BACK"
+
+messages = [
+    {"role": "system", "content": SYSTEM_PROMPT},
+    {"role": "user",   "content": raw_content},
+]
+
+from unsloth.chat_templates import get_chat_template
+tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    tokenize = True,
+    add_generation_prompt = True,
+    return_tensors = "pt",
+).to(model.device)
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens = 256,
+    temperature = 0.7,
+    top_p = 0.9,
+)
+
+print(tokenizer.decode(outputs[0], skip_special_tokens = True))
+```
+
+### 2. Recommended Prompt Pattern
+
+**User message:**
+
+> “Please clean and normalize the following clinical text. Preserve all clinical facts and do not add anything new:  
+> <raw content here>”
+
+**System message:** the training system prompt shown in the “System Prompt (Baked-in Behavior)” section, or a shorter variant:
+
+> “You are a clinical text cleaning assistant. Rewrite the input into clear, grammatically correct, medically faithful prose, preserving all facts.”
+
+---
+
+## Example
+
+**Input (user):**
+
+```text
+DAILY FLUID INTAKE 1.5LTR/DAY  Check CBC, RFT,K+, RBS,CXR, ECG, in follow up  Chest physiotherapy, wound hygiene & proper diet.  Steam Inhalation three times a day.  Syp Alex or TusQDx 2 spoons threee times a day if dry cough is present  FBS AND PPBS AFTER 3DAYS IF BLOOD SUGAR IS <80 OR >200mg/dl THEN CONSULT to local physician.  WHILE SPLEEPING MUST LIE SUPINE FOR ONE MONTH, CAN REMOVE BELT IN NIGHT ONLY. 
+```
+
+**Output (assistant):**
+
+> Maintain a daily fluid intake of about 1.5 L per day. At the follow-up visit, get a complete blood count (CBC), renal function tests (RFT), serum potassium (K+), random blood sugar (RBS), chest X-ray (CXR), and electrocardiogram (ECG) done. Continue chest physiotherapy, maintain good wound hygiene, and follow a nutritious diet. Do steam inhalation three times daily. If you have a dry cough, take syrup Alex or TusQDx 2 spoonfuls three times daily. Check fasting and post-meal blood sugar after 3 days; if it is less than 80 mg/dL or more than 200 mg/dL, consult your local physician. While sleeping for the next month, lie on your back (supine), and you may remove the belt only at night.
+
+*(Output is illustrative of the intended style; exact generations may vary.)*
+
+---
+
+## Training Details
+
+- **Base model:** `unsloth/Llama-3.2-1B-Instruct`
+- **Method:** Supervised fine-tuning (SFT)
+- **Library:** Unsloth + TRL `SFTTrainer`
+- **Loss masking:** `train_on_responses_only` (only assistant outputs)
+- **Context length:** 2048
+- **Objective:** Minimize loss on cleaned `response` given `content` (and system + user context)
+
+Because the dataset is **relatively small and noisy**, this model should be considered **experimental** and evaluated carefully before integration into production pipelines.
+
+---
+
+## Suggested Hyperparameters (Inference)
+
+You can tune to your own taste, but a reasonable starting point:
+
+- **Temperature:** `0.3 – 0.7`  
+- **Top-p:** `0.9`  
+- **Max new tokens:** `256–512`  
+- **Repetition penalty:** optional (`1.0–1.1`)
+
+Lower temperatures help keep the model closer to “copy-editor” mode rather than creative rewriting.
+
+---
+
+## Limitations & Risks
+
+- **Not a medical device**:  
+  This model does *not* replace clinical judgment, and should not be used for diagnosis, triage, or direct treatment recommendations.
+
+- **Data coverage**:  
+  Trained on a specific institution’s style and a limited range of note types. May not generalize to all clinical documentation formats.
+
+- **Factual hallucination**:  
+  While the training objective and system prompt encourage fidelity to the source, the model can still drop details, rephrase them incorrectly, or introduce information that is not present in the source. **A human clinician must review outputs.**
+
+- **Biases**:  
+  Any biases or idiosyncrasies present in the source data may be reflected in the cleaned text.
+
+---
+
+## License
+
+The `license: other` field is a placeholder.  
+Before public or commercial use, **please set the correct license** for this model and its training data in accordance with your institutional and legal requirements.
+
+---
+
+## Acknowledgments
+
+- [Unsloth](https://github.com/unslothai/unsloth) for efficient finetuning on small hardware.
+- The maintainers of Llama 3.x models for providing strong base models.
+- Clinical staff and data teams whose documentation (anonymized and processed) made it possible to experiment with this type of text-cleaning model.
+
+---
+
+## Disclaimer
+
+This model is intended **solely** for research and internal tooling around **text normalization** of clinical documentation. It must **not** be used to:
+
+- make or suggest diagnoses,  
+- decide treatment options,  
+- communicate directly with patients, or  
+- operate without human clinical oversight.
+
+Always have qualified healthcare professionals review and validate outputs before any clinical use.
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,139 @@
+{#- Chat template preamble: emit the BOS token, then fill in defaults for optional template variables. #}
+{{- bos_token }}
+{#- When `custom_tools` is supplied it replaces `tools`. #}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{#- Unless the caller says otherwise, tool definitions go into the first user message, not the system block. #}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{#- Fallback value for the "Today Date" line when no date_string is passed in. #}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 July 2024" %}
+{%- endif %}
+{#- Normalize a missing `tools` variable to none so later `is none` checks are safe. #}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{#- If the first message has role 'system', its content is saved and that message is removed from the list; otherwise the system text is empty. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content'] %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{#- The system header is emitted unconditionally, even when no system message was supplied. #}
+{{- "<|start_header_id|>system<|end_header_id|>
+
+" }}
+{#- "Environment: ipython" is added whenever builtin_tools is defined or tools is not none. #}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython
+" }}
+{%- endif %}
+{#- List the builtin tools, comma-separated, leaving out 'code_interpreter'. #}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "
+
+"}}
+{%- endif %}
+{#- Fixed knowledge-cutoff line followed by the (possibly defaulted) date_string. #}
+{{- "Cutting Knowledge Date: December 2023
+" }}
+{{- "Today Date: " + date_string + "
+
+" }}
+{#- When tools are NOT routed through the user message, describe them here, one JSON definition per tool. #}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.
+
+" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "
+
+" }}
+    {%- endfor %}
+{%- endif %}
+{#- The caller's system text (possibly empty) comes last, then the system turn is closed. #}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{#- The first remaining message is removed from the list and re-emitted below as a user turn with the tool definitions in front of it. #}
+{#- NOTE(review): that message is taken as-is; its role is not checked here. #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content'] %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>
+
+' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.
+
+" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.
+
+" }}
+    {#- One indented JSON definition per tool, each followed by a blank line. #}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "
+
+" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{#- Render the remaining messages in order, one turn per message. #}
+{%- for message in messages %}
+    {#- Plain turn: not a tool/ipython result and no tool_calls; emitted as header + content + <|eot_id|>. #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+'+ message['content'] + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {#- Exactly one tool call per message is accepted; anything else raises. #}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {#- Builtin tools are written as <|python_tag|>name.call(arg="val", ...); all other tools as a JSON object. #}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {#- NOTE(review): arg_val is string-concatenated here; non-string argument values would presumably fail — confirm. #}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {#- The tool-call turn ends with <|eom_id|> when builtin_tools is defined, otherwise with <|eot_id|>. #}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {#- Tool results are always rendered under the 'ipython' header, whichever of the two roles was given. #}
+        {{- "<|start_header_id|>ipython<|end_header_id|>
+
+" }}
+        {#- NOTE(review): plain strings probably also satisfy `is iterable`, so string content would take the tojson branch too — confirm. #}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{#- When add_generation_prompt is set, open an assistant turn (header plus blank line) for the model to continue. #}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>
+
+' }}
+{%- endif %}
--- a/clinical-info-corrector-1B.Q4_K_M.gguf
+++ b/clinical-info-corrector-1B.Q4_K_M.gguf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c298ecbff60e98760a9b1cdbd10fc3395cc187a4ef9b1fd9f22bd3ff5a79e98
+size 807694848
--- a/clinical-info-corrector-1B.Q5_K_M.gguf
+++ b/clinical-info-corrector-1B.Q5_K_M.gguf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b07cbb6bd908fb053efe6a3369c925e7543fa3ad02a3f21bc7c3ee8db052c06
+size 911503872
--- a/clinical-info-corrector-1B.Q8_0.gguf
+++ b/clinical-info-corrector-1B.Q8_0.gguf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:251257da6f7c09f3d48927241b6d581304d98189bb01a5b0e3a36b33af0667c8
+size 1321083392
--- a/clinical-info-corrector-1B.png
+++ b/clinical-info-corrector-1B.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5d686cd6885d5c0cd2326b88fe9e946980bcb1d36f05c505331ae71cdc2e218
+size 2244721
--- a/clinical-info-corrector-1B.safetensors
+++ b/clinical-info-corrector-1B.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04efc4d60d64b5c7c0518e566a6c7ea71943491e6000e872b504a75fdd2ec24f
+size 2471645608
--- a/config.json
+++ b/config.json
@@ -0,0 +1,38 @@
+{
+    "architectures": [
+        "LlamaForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 128000,
+    "torch_dtype": "float16",
+    "eos_token_id": 128009,
+    "head_dim": 64,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 8192,
+    "max_position_embeddings": 131072,
+    "mlp_bias": false,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 16,
+    "num_key_value_heads": 8,
+    "pad_token_id": 128004,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "factor": 32.0,
+        "high_freq_factor": 4.0,
+        "low_freq_factor": 1.0,
+        "original_max_position_embeddings": 8192,
+        "rope_type": "llama3"
+    },
+    "rope_theta": 500000.0,
+    "tie_word_embeddings": true,
+    "transformers_version": "4.56.2",
+    "unsloth_fixed": true,
+    "unsloth_version": "2025.12.10",
+    "use_cache": true,
+    "vocab_size": 128256
+}
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|finetune_right_pad_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json