初始化项目,由ModelHub XC社区提供模型
Model: npc-worldwide/TinyTimV1 Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
ggml-model-f16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||||
67
README.md
Normal file
67
README.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# TinyTimV1: Fine-tuning TinyLlama on Finnegans Wake
|
||||||
|
|
||||||
|
A project exploring the fine-tuning of TinyLlama-1.1B on James Joyce's *Finnegans Wake* to generate Joyce-inspired text.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This project fine-tunes the TinyLlama-1.1B-Chat model on the complete text of James Joyce's *Finnegans Wake*, creating a language model capable of generating text in Joyce's distinctive experimental style. The model learns to replicate the complex wordplay, neologisms, and stream-of-consciousness narrative techniques characteristic of Joyce's final work.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- `process_wake.py` - Preprocesses the raw text, removes page numbers, and splits into manageable chunks
|
||||||
|
- `fine_tune_joyce.py` - Main training script using HuggingFace Transformers
|
||||||
|
- `text_gen.py` - Text generation script for the fine-tuned model
|
||||||
|
- `finn_wake.txt` - Complete text of Finnegans Wake (1.51 MB)
|
||||||
|
- `finn_wake.csv` - Processed dataset in CSV format
|
||||||
|
- `finn_wake_dataset/` - Tokenized dataset directory
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### 1. Data Preprocessing
|
||||||
|
```bash
|
||||||
|
python process_wake.py
|
||||||
|
```
|
||||||
|
This removes page numbers and splits the text into 100-word chunks for training.
|
||||||
|
### 2. Fine-tuning
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python fine_tune_joyce.py
|
||||||
|
```
|
||||||
|
Fine-tunes TinyLlama on the processed dataset for 3 epochs with CPU training.
|
||||||
|
### 3. Text Generation
|
||||||
|
```bash
|
||||||
|
python text_gen.py
|
||||||
|
```
|
||||||
|
Generates Joyce-inspired text using the fine-tuned model.
|
||||||
|
|
||||||
|
## Model Details

- Base Model: TinyLlama-1.1B-Chat-v1.0
- Training Data: Finnegans Wake (~1.5MB text)
- Training Parameters:
  - 3 epochs
  - Batch size: 1
  - Max sequence length: 128 tokens
- Generation Parameters (set in `text_gen.py`):
  - Temperature: 0.7
  - Top-k: 50, Top-p: 0.95
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
Input: "ae left to go to ireland and found a fairy"
|
||||||
|
The model generates text continuing in Joyce's experimental style with invented words, Irish references, and complex linguistic play.
|
||||||
|
## Requirements

- transformers
- datasets
- pandas
- torch
|
||||||
|
## Installation

```bash
pip install transformers datasets pandas torch
```
|
||||||
|
## Notes

- Training was performed on CPU due to resource constraints
- Model checkpoints saved every 500 steps
- Resume training supported from checkpoints
|
||||||
|
|
||||||
|
|
||||||
28
config.json
Normal file
28
config.json
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "tinyllama/tinyllama-1.1b-chat-v1.0",
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 2048,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 5632,
|
||||||
|
"max_position_embeddings": 2048,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 22,
|
||||||
|
"num_key_value_heads": 4,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 10000.0,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "float32",
|
||||||
|
"transformers_version": "4.37.2",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 32000
|
||||||
|
}
|
||||||
47
fine_tune_joyce.py
Normal file
47
fine_tune_joyce.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
import os

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, load_from_disk

# Fine-tunes the TinyLlama chat model on the dataset stored in 'finn_wake_dataset'.
# Runs as a flat script: load data -> tokenize -> split -> train (resuming if possible).

BASE_MODEL = "tinyllama/tinyllama-1.1b-chat-v1.0"
# Directory the tokenizer is written to. The original path was ".results/..."
# (missing slash), which created a stray hidden directory instead of writing
# under the Trainer's output_dir ("./results").
TOKENIZER_SAVE_DIR = "./results/checkpoint-12000/"
# Checkpoint the interrupted run is resumed from (see the note above trainer.train).
RESUME_CHECKPOINT = "./results/checkpoint-10000"
# Every example is padded/truncated to exactly this many tokens.
MAX_LENGTH = 128

dataset = load_from_disk('finn_wake_dataset')

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Make sure a pad token exists before the tokenizer is saved or used for padding,
# so the saved tokenizer matches the one used during training.
if tokenizer.pad_token is None:
    print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)


def tokenize_function(examples):
    """Tokenize a batch of rows and attach causal-LM labels.

    Args:
        examples: a batch from ``dataset.map(..., batched=True)``; its ``'text'``
            entry is passed straight to the tokenizer.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` except at padding positions (``attention_mask == 0``),
        which are set to -100 so the loss ignores them instead of training the
        model to emit pad tokens.
    """
    tokenized_inputs = tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    tokenized_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Hold out 10% of the rows as an evaluation split.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Below has been modified because I ran out of disk storage initially, so I had
# to resume and adjust the save_strategy above.
# Only resume when the checkpoint directory is actually present; with
# save_total_limit=2 older checkpoints are removed as training proceeds, and a
# hard-coded path that no longer exists would otherwise abort the run.
resume_checkpoint = RESUME_CHECKPOINT if os.path.isdir(RESUME_CHECKPOINT) else None
if resume_checkpoint is None:
    print(f"Checkpoint {RESUME_CHECKPOINT} not found; starting training from scratch.")
trainer.train(resume_from_checkpoint=resume_checkpoint)
|
||||||
4558
finn_wake.csv
Normal file
4558
finn_wake.csv
Normal file
File diff suppressed because it is too large
Load Diff
3728
finn_wake.txt
Normal file
3728
finn_wake.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
finn_wake_dataset/cache-07333d095ed3feb5.arrow
Normal file
3
finn_wake_dataset/cache-07333d095ed3feb5.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:7799f52711777e954892b9143657a98eb4ad4412c8eb9ec8d51cf9f2d29d0960
|
||||||
|
size 33738392
|
||||||
3
finn_wake_dataset/cache-08eaee58c5a946a5.arrow
Normal file
3
finn_wake_dataset/cache-08eaee58c5a946a5.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:57d59e9370cb6130a4315fa2e751c32aab502b2f9bf80f25cb52e243364f6915
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-108bb49fa40e19cc.arrow
Normal file
3
finn_wake_dataset/cache-108bb49fa40e19cc.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fee0801181eb75905ceec32c2100b5e957535fec236e46f6268fb204b9a87f2d
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-21a914d68d7decd3.arrow
Normal file
3
finn_wake_dataset/cache-21a914d68d7decd3.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:4db4f65eb28996e319333d5b010d0881b9a6095fc04f8fd9c6eba422037813d5
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-21efcd3d18f14eb8.arrow
Normal file
3
finn_wake_dataset/cache-21efcd3d18f14eb8.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:751aada291665431650955366b4430502c33b9b2c2aaf7376eaf89abb08e9621
|
||||||
|
size 83745984
|
||||||
3
finn_wake_dataset/cache-25107278423e0b27.arrow
Normal file
3
finn_wake_dataset/cache-25107278423e0b27.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9866ffc0fbbc4c8bc780314aa2fcc9dabf43df74f71d3944b67f37ab0e794158
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-294de0f58e07010c.arrow
Normal file
3
finn_wake_dataset/cache-294de0f58e07010c.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:4af4228221d06dc079633bac9bf6f84af52752e3b569e99b5da1a4283e1c8ce4
|
||||||
|
size 33832
|
||||||
3
finn_wake_dataset/cache-3d5c944f43725d69.arrow
Normal file
3
finn_wake_dataset/cache-3d5c944f43725d69.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:6ac4e76ce5d8ba937fd8ab9db955eacb62c1d2705c5a5b9765253687549e3e33
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-40ed1615346dc08d.arrow
Normal file
3
finn_wake_dataset/cache-40ed1615346dc08d.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:8f10c74d395f429b9b9795271ac556ede1daf8b8c05f04986d7f54fb4aea0688
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-4160687fb3073882.arrow
Normal file
3
finn_wake_dataset/cache-4160687fb3073882.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:8ad2214bd7e550cec284ffaa766b916a076a0e75fe377ce855634dbfeea467ae
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-4328f755c07fa7f2.arrow
Normal file
3
finn_wake_dataset/cache-4328f755c07fa7f2.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:549c73a76cd629ad9e36435289c0354fbd8c44dc91edcd3a9e3a2c8efa604e67
|
||||||
|
size 33738392
|
||||||
3
finn_wake_dataset/cache-461df009004232f3.arrow
Normal file
3
finn_wake_dataset/cache-461df009004232f3.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d214cac2a17f55672d3c7a182a25beb09ac4a6232f91518449bf64b4c36e4aa6
|
||||||
|
size 33832
|
||||||
3
finn_wake_dataset/cache-4a71b21bdeaa2ec7.arrow
Normal file
3
finn_wake_dataset/cache-4a71b21bdeaa2ec7.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9ec3d6cde0807e531ecf7b92decd38a27a3ace2c04f2190721abd9d4968d9255
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-54251f52f75d36f0.arrow
Normal file
3
finn_wake_dataset/cache-54251f52f75d36f0.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:ca4cd45ca17b5d83c8c97bea2fab7ceef28b99931062c92cb1e19bbf02c63415
|
||||||
|
size 4096
|
||||||
3
finn_wake_dataset/cache-5929bf28441deefd.arrow
Normal file
3
finn_wake_dataset/cache-5929bf28441deefd.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:801a239d436b573afd7561832db31eaf9cc40c878a4bc6aeccbdf9913df699e9
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-5be06efb6c2a7249.arrow
Normal file
3
finn_wake_dataset/cache-5be06efb6c2a7249.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9a992fa9985e1e353bb8ade640fbfff72e2e75db5a2a72d8bec3ba5a258ba99c
|
||||||
|
size 33832
|
||||||
3
finn_wake_dataset/cache-84a914c4d2cc8d93.arrow
Normal file
3
finn_wake_dataset/cache-84a914c4d2cc8d93.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:ad6c49e3bdb4f56e85c5f62077186a62d17646acc4d8fc1440cfd4317e1278d9
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-8cc3936385189eb8.arrow
Normal file
3
finn_wake_dataset/cache-8cc3936385189eb8.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1b077a811daba8e024ca0e53fcc08565f1f2d676e586f39b25f88881b45adc60
|
||||||
|
size 4096
|
||||||
3
finn_wake_dataset/cache-8f8de0a505da9b57.arrow
Normal file
3
finn_wake_dataset/cache-8f8de0a505da9b57.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9ace69e62138b3125ae070c00efafd2193e23e6cf52199a49523852169bca17d
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-93f1332b83d2a02e.arrow
Normal file
3
finn_wake_dataset/cache-93f1332b83d2a02e.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1a90517adc17bac71edd4db47e7fe7780687ba2a70711111cd43fd69a6592fe9
|
||||||
|
size 4096
|
||||||
3
finn_wake_dataset/cache-a463b77c9c10fcdd.arrow
Normal file
3
finn_wake_dataset/cache-a463b77c9c10fcdd.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:b236cc0e50d9affe1746fc1530b3e99a9d572acce641e26d0c5ef4cb164af744
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-b41c1b1ef780b910.arrow
Normal file
3
finn_wake_dataset/cache-b41c1b1ef780b910.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:0e26f42fdfc293b91e57c84890b675de30bbdb88238775d0addbbaf488e87335
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-ba9518a20dfb972e.arrow
Normal file
3
finn_wake_dataset/cache-ba9518a20dfb972e.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:3967fa682ccf74313b48cab8a799cde75a4dd48f7324cdb0361feef6d42d12eb
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-dda16c8e4395755d.arrow
Normal file
3
finn_wake_dataset/cache-dda16c8e4395755d.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fe7a51ecd1cb6d50858465c20d310f58a8ca6428dc041313d4c84bf7ba333c74
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-e1c4ec6e99052dfc.arrow
Normal file
3
finn_wake_dataset/cache-e1c4ec6e99052dfc.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:462b68fef59b237820edd0a402230fb1e3953f1e3572977cf15b2e0ad46400a3
|
||||||
|
size 5271744
|
||||||
3
finn_wake_dataset/cache-e6f91d075c0b5063.arrow
Normal file
3
finn_wake_dataset/cache-e6f91d075c0b5063.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:cb7be6b6fbca4557816f0dd00b3ac3fcfbbe0a08a2c93b851455ae642b2e116a
|
||||||
|
size 7640328
|
||||||
3
finn_wake_dataset/cache-e79c82020953bbef.arrow
Normal file
3
finn_wake_dataset/cache-e79c82020953bbef.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:04731bd4add6cb26e32f9e697f5764f421f64e62e774b13371743ed48327f4c0
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-f135f63843848cc4.arrow
Normal file
3
finn_wake_dataset/cache-f135f63843848cc4.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:e4a87a79fea2a4dac7a8607f5aab278894807282427da346899de8d52b132100
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-f39a662b6194c6e4.arrow
Normal file
3
finn_wake_dataset/cache-f39a662b6194c6e4.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:8a591977c03ca3bde5d1916489df376e89bcce387d660394adfaf90448d5bc4a
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-f7fe086af672c971.arrow
Normal file
3
finn_wake_dataset/cache-f7fe086af672c971.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:36a242fdf45fcea8a806a8df21299049d99a2c6a3489a9a6ddfdab82626df1ef
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-fb842fe1846f33ac.arrow
Normal file
3
finn_wake_dataset/cache-fb842fe1846f33ac.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fb719489a9a6868c7f28d4e237e1d7ad516b74333fc2adff4dea6010d26e381b
|
||||||
|
size 23368
|
||||||
3
finn_wake_dataset/cache-fda59bdb198b3ef4.arrow
Normal file
3
finn_wake_dataset/cache-fda59bdb198b3ef4.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d9bd80aa28367576aaa834a272d4a192d554d2fc3f70914d1b53db9851f04231
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/cache-fefdfd5378240940.arrow
Normal file
3
finn_wake_dataset/cache-fefdfd5378240940.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:867fbcea23ec00b23b23e36e1522a1795d6aa179f27d098fae2973fe3b3ba38c
|
||||||
|
size 2968
|
||||||
3
finn_wake_dataset/data-00000-of-00001.arrow
Normal file
3
finn_wake_dataset/data-00000-of-00001.arrow
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:f3684e77cfcc0fdcc1a627db2f5caa5f306f61f70ee4d2434f205293fc491037
|
||||||
|
size 1321944
|
||||||
12
finn_wake_dataset/dataset_info.json
Normal file
12
finn_wake_dataset/dataset_info.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"citation": "",
|
||||||
|
"description": "",
|
||||||
|
"features": {
|
||||||
|
"text": {
|
||||||
|
"dtype": "string",
|
||||||
|
"_type": "Value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"homepage": "",
|
||||||
|
"license": ""
|
||||||
|
}
|
||||||
13
finn_wake_dataset/state.json
Normal file
13
finn_wake_dataset/state.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"_data_files": [
|
||||||
|
{
|
||||||
|
"filename": "data-00000-of-00001.arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"_fingerprint": "953e00a2598c0e70",
|
||||||
|
"_format_columns": null,
|
||||||
|
"_format_kwargs": {},
|
||||||
|
"_format_type": null,
|
||||||
|
"_output_all_columns": false,
|
||||||
|
"_split": null
|
||||||
|
}
|
||||||
7
generation_config.json
Normal file
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": 2,
|
||||||
|
"max_length": 2048,
|
||||||
|
"pad_token_id": 0,
|
||||||
|
"transformers_version": "4.37.2"
|
||||||
|
}
|
||||||
3
ggml-model-f16.gguf
Normal file
3
ggml-model-f16.gguf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:09ae44fe0ac6ed4c693a089f85c1b32fa774ba43b4fbffe73a06e4e74175bf12
|
||||||
|
size 2201017344
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1f63f11b47b09bb6ff12cff716dea1240718e328d2741beb5cfe27feb51b931b
|
||||||
|
size 4400216536
|
||||||
3
optimizer.pt
Normal file
3
optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:343949e9115ed68d9e55d79d24e36360fcf8e27cb05da7490477c2be68e90e1c
|
||||||
|
size 8800555534
|
||||||
23
process_wake.py
Normal file
23
process_wake.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
import re
|
||||||
|
def is_page_number(line):
    """Return True when ``line`` is a bare page number.

    A line counts as a page number when, after stripping surrounding
    whitespace, it is non-empty and consists only of digit characters.

    Args:
        line: one line of text (trailing newline allowed).

    Returns:
        bool: True for lines such as ``"12\\n"``; False for blank lines or
        lines containing any non-digit character.
    """
    return line.strip().isdigit()
|
||||||
|
# Read the raw book text, drop lines that are bare page numbers, and join the
# remaining lines (newlines preserved) into one string for paragraph splitting.
with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

filtered_lines = [line for line in lines if not is_page_number(line)]
text = ''.join(filtered_lines)
|
||||||
|
from datasets import Dataset
|
||||||
|
import pandas as pd
|
||||||
|
def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Split a paragraph into smaller parts with a maximum length in words.

    Words are obtained with ``str.split()`` (any whitespace run is a
    separator) and re-joined with single spaces, so original spacing is not
    preserved.

    Args:
        paragraph: the text to split.
        max_length: maximum number of words per yielded part; must be >= 1.

    Yields:
        str: consecutive chunks of at most ``max_length`` words. Nothing is
        yielded for an empty or whitespace-only paragraph.

    Raises:
        ValueError: if ``max_length`` is less than 1 (raised when iteration
        starts, since this is a generator).
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive number of words")
    words = paragraph.split()
    for start in range(0, len(words), max_length):
        yield ' '.join(words[start:start + max_length])
|
||||||
|
# Treat every non-blank line of the filtered text as a paragraph and break it
# into chunks of at most 100 words.
paragraphs = text.split('\n')
split_paragraphs = []
for paragraph in paragraphs:
    if paragraph.strip() != "":
        split_paragraphs.extend(split_paragraph_into_smaller_parts(paragraph, max_length=100))

# One chunk per row in a single 'text' column; written both as a CSV file and
# as a dataset directory saved with save_to_disk.
df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')
|
||||||
3
rng_state.pth
Normal file
3
rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
|
||||||
|
size 14244
|
||||||
3
scheduler.pt
Normal file
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:aededd21cf97b36e3bd6c09cddfa5fcf4ec86682b9ad2cd27542eb68f799a7e7
|
||||||
|
size 1064
|
||||||
30
special_tokens_map.json
Normal file
30
special_tokens_map.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
27
text_gen.py
Normal file
27
text_gen.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
|
||||||
|
from datasets import load_dataset, load_from_disk
|
||||||
|
|
||||||
|
# Post-training: load the fine-tuned checkpoint and sample a continuation of a
# fixed prompt.
model_path = "./results/checkpoint-12000"
model = AutoModelForCausalLM.from_pretrained(model_path)
# The tokenizer is loaded from the base model name rather than from model_path.
tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

input_text = "ae left to go to ireland and found a fairy"
# Encode the prompt once and reuse it; previously it was encoded a second time
# inside the generate() call and this variable was left unused.
input_ids = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(
    input_ids=input_ids,
    max_length=400,          # passed through to generate() as the length cap
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    num_beams=5,             # kept as in the original call, alongside do_sample=True
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)
|
||||||
93391
tokenizer.json
Normal file
93391
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tokenizer.model
(Stored with Git LFS)
Normal file
BIN
tokenizer.model
(Stored with Git LFS)
Normal file
Binary file not shown.
42
tokenizer_config.json
Normal file
42
tokenizer_config.json
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": true,
|
||||||
|
"add_eos_token": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"0": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"1": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"content": "</s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bos_token": "<s>",
|
||||||
|
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "</s>",
|
||||||
|
"legacy": false,
|
||||||
|
"model_max_length": 2048,
|
||||||
|
"pad_token": "</s>",
|
||||||
|
"padding_side": "right",
|
||||||
|
"sp_model_kwargs": {},
|
||||||
|
"tokenizer_class": "LlamaTokenizer",
|
||||||
|
"unk_token": "<unk>",
|
||||||
|
"use_default_system_prompt": false
|
||||||
|
}
|
||||||
7221
trainer_state.json
Normal file
7221
trainer_state.json
Normal file
File diff suppressed because it is too large
Load Diff
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:38629dcd88d8b9f70ec05e265d9d93a102045189b8a3015d57602fde7d693a06
|
||||||
|
size 4600
|
||||||
Reference in New Issue
Block a user