Initialize the project; model provided by the ModelHub XC community
Model: npc-worldwide/TinyTimV1 Source: Original Platform
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
ggml-model-f16.gguf filter=lfs diff=lfs merge=lfs -text
67
README.md
Normal file
@@ -0,0 +1,67 @@
# TinyTimV1: Fine-tuning TinyLlama on Finnegans Wake

A project exploring the fine-tuning of TinyLlama-1.1B on James Joyce's *Finnegans Wake* to generate Joyce-inspired text.

## Overview

This project fine-tunes the TinyLlama-1.1B-Chat model on the complete text of James Joyce's *Finnegans Wake*, creating a language model capable of generating text in Joyce's distinctive experimental style. The model learns to replicate the complex wordplay, neologisms, and stream-of-consciousness narrative techniques characteristic of Joyce's final work.

## Files

- `process_wake.py` - Preprocesses the raw text, removes page numbers, and splits it into manageable chunks
- `fine_tune_joyce.py` - Main training script using HuggingFace Transformers
- `text_gen.py` - Text generation script for the fine-tuned model
- `finn_wake.txt` - Complete text of Finnegans Wake (1.51 MB)
- `finn_wake.csv` - Processed dataset in CSV format
- `finn_wake_dataset/` - Tokenized dataset directory

## Usage

### 1. Data Preprocessing

```bash
python process_wake.py
```

This removes page numbers and splits the text into 100-word chunks for training.
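For reference, the core chunking logic, condensed from `process_wake.py` (the full script also writes `finn_wake.csv` and the dataset directory):

```python
def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Yield successive max_length-word slices of a paragraph."""
    words = paragraph.split()
    for i in range(0, len(words), max_length):
        yield ' '.join(words[i:i+max_length])
```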

### 2. Fine-tuning

```bash
python fine_tune_joyce.py
```

Fine-tunes TinyLlama on the processed dataset for 3 epochs on CPU.
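The key training configuration, condensed from `fine_tune_joyce.py`:

```python
from transformers import TrainingArguments

# Condensed from fine_tune_joyce.py: CPU training with frequent checkpointing.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    use_cpu=True,
)
```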

### 3. Text Generation

```bash
python text_gen.py
```

Generates Joyce-inspired text using the fine-tuned model.
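The sampling setup, condensed from `text_gen.py`:

```python
# Condensed from text_gen.py: multinomial sampling combined with beam search.
output = model.generate(
    input_ids=input_ids,
    max_length=400,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    num_beams=5,
)
```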

## Model Details

- Base Model: TinyLlama-1.1B-Chat-v1.0
- Training Data: Finnegans Wake (~1.5 MB text)
- Training Parameters:
  - 3 epochs
  - Batch size: 1
  - Max sequence length: 128 tokens
- Generation Parameters:
  - Temperature: 0.7
  - Top-k: 50, Top-p: 0.95

## Example Output

Input: "ae left to go to ireland and found a fairy"

The model generates text continuing in Joyce's experimental style, with invented words, Irish references, and complex linguistic play.

## Requirements

- transformers
- datasets
- pandas
- torch

## Installation

```bash
pip install transformers datasets pandas torch
```

## Notes

- Training was performed on CPU due to resource constraints
- Model checkpoints saved every 500 steps
- Resuming training from a checkpoint is supported (see the sketch below)
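A minimal resume call, as used in `fine_tune_joyce.py` (the checkpoint path is the one from this run):

```python
# Pick up training from a saved checkpoint instead of starting over.
trainer.train(resume_from_checkpoint="./results/checkpoint-10000")
```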
28
config.json
Normal file
@@ -0,0 +1,28 @@
{
  "_name_or_path": "tinyllama/tinyllama-1.1b-chat-v1.0",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "use_cache": true,
  "vocab_size": 32000
}
47
fine_tune_joyce.py
Normal file
@@ -0,0 +1,47 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_from_disk

# Load the chunked Finnegans Wake dataset produced by process_wake.py.
dataset = load_from_disk('finn_wake_dataset')

tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

# Save the tokenizer into the final checkpoint directory (created if missing).
tokenizer.save_pretrained("./results/checkpoint-12000/")

model = AutoModelForCausalLM.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

if tokenizer.pad_token is None:
    print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Pad/truncate each 100-word chunk to 128 tokens; for causal LM training
    # the labels are a copy of the input ids.
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Modified after an initial run exhausted disk space: training resumes from an
# existing checkpoint, and save_strategy/save_total_limit above were adjusted.
trainer.train(resume_from_checkpoint="./results/checkpoint-10000")
4558
finn_wake.csv
Normal file
File diff suppressed because it is too large
3728
finn_wake.txt
Normal file
File diff suppressed because it is too large
3
finn_wake_dataset/cache-07333d095ed3feb5.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7799f52711777e954892b9143657a98eb4ad4412c8eb9ec8d51cf9f2d29d0960
size 33738392
3
finn_wake_dataset/cache-08eaee58c5a946a5.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57d59e9370cb6130a4315fa2e751c32aab502b2f9bf80f25cb52e243364f6915
size 23368
3
finn_wake_dataset/cache-108bb49fa40e19cc.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fee0801181eb75905ceec32c2100b5e957535fec236e46f6268fb204b9a87f2d
size 23368
3
finn_wake_dataset/cache-21a914d68d7decd3.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4db4f65eb28996e319333d5b010d0881b9a6095fc04f8fd9c6eba422037813d5
size 23368
3
finn_wake_dataset/cache-21efcd3d18f14eb8.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:751aada291665431650955366b4430502c33b9b2c2aaf7376eaf89abb08e9621
size 83745984
3
finn_wake_dataset/cache-25107278423e0b27.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9866ffc0fbbc4c8bc780314aa2fcc9dabf43df74f71d3944b67f37ab0e794158
size 2968
3
finn_wake_dataset/cache-294de0f58e07010c.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4af4228221d06dc079633bac9bf6f84af52752e3b569e99b5da1a4283e1c8ce4
size 33832
3
finn_wake_dataset/cache-3d5c944f43725d69.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ac4e76ce5d8ba937fd8ab9db955eacb62c1d2705c5a5b9765253687549e3e33
size 2968
3
finn_wake_dataset/cache-40ed1615346dc08d.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f10c74d395f429b9b9795271ac556ede1daf8b8c05f04986d7f54fb4aea0688
size 23368
3
finn_wake_dataset/cache-4160687fb3073882.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ad2214bd7e550cec284ffaa766b916a076a0e75fe377ce855634dbfeea467ae
size 23368
3
finn_wake_dataset/cache-4328f755c07fa7f2.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:549c73a76cd629ad9e36435289c0354fbd8c44dc91edcd3a9e3a2c8efa604e67
size 33738392
3
finn_wake_dataset/cache-461df009004232f3.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d214cac2a17f55672d3c7a182a25beb09ac4a6232f91518449bf64b4c36e4aa6
size 33832
3
finn_wake_dataset/cache-4a71b21bdeaa2ec7.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ec3d6cde0807e531ecf7b92decd38a27a3ace2c04f2190721abd9d4968d9255
size 2968
3
finn_wake_dataset/cache-54251f52f75d36f0.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca4cd45ca17b5d83c8c97bea2fab7ceef28b99931062c92cb1e19bbf02c63415
size 4096
3
finn_wake_dataset/cache-5929bf28441deefd.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:801a239d436b573afd7561832db31eaf9cc40c878a4bc6aeccbdf9913df699e9
size 23368
3
finn_wake_dataset/cache-5be06efb6c2a7249.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a992fa9985e1e353bb8ade640fbfff72e2e75db5a2a72d8bec3ba5a258ba99c
size 33832
3
finn_wake_dataset/cache-84a914c4d2cc8d93.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad6c49e3bdb4f56e85c5f62077186a62d17646acc4d8fc1440cfd4317e1278d9
size 2968
3
finn_wake_dataset/cache-8cc3936385189eb8.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b077a811daba8e024ca0e53fcc08565f1f2d676e586f39b25f88881b45adc60
size 4096
3
finn_wake_dataset/cache-8f8de0a505da9b57.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ace69e62138b3125ae070c00efafd2193e23e6cf52199a49523852169bca17d
size 2968
3
finn_wake_dataset/cache-93f1332b83d2a02e.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a90517adc17bac71edd4db47e7fe7780687ba2a70711111cd43fd69a6592fe9
size 4096
3
finn_wake_dataset/cache-a463b77c9c10fcdd.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b236cc0e50d9affe1746fc1530b3e99a9d572acce641e26d0c5ef4cb164af744
size 2968
3
finn_wake_dataset/cache-b41c1b1ef780b910.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e26f42fdfc293b91e57c84890b675de30bbdb88238775d0addbbaf488e87335
size 2968
3
finn_wake_dataset/cache-ba9518a20dfb972e.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3967fa682ccf74313b48cab8a799cde75a4dd48f7324cdb0361feef6d42d12eb
size 23368
3
finn_wake_dataset/cache-dda16c8e4395755d.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fe7a51ecd1cb6d50858465c20d310f58a8ca6428dc041313d4c84bf7ba333c74
size 23368
3
finn_wake_dataset/cache-e1c4ec6e99052dfc.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:462b68fef59b237820edd0a402230fb1e3953f1e3572977cf15b2e0ad46400a3
size 5271744
3
finn_wake_dataset/cache-e6f91d075c0b5063.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb7be6b6fbca4557816f0dd00b3ac3fcfbbe0a08a2c93b851455ae642b2e116a
size 7640328
3
finn_wake_dataset/cache-e79c82020953bbef.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04731bd4add6cb26e32f9e697f5764f421f64e62e774b13371743ed48327f4c0
size 23368
3
finn_wake_dataset/cache-f135f63843848cc4.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e4a87a79fea2a4dac7a8607f5aab278894807282427da346899de8d52b132100
size 2968
3
finn_wake_dataset/cache-f39a662b6194c6e4.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a591977c03ca3bde5d1916489df376e89bcce387d660394adfaf90448d5bc4a
size 2968
3
finn_wake_dataset/cache-f7fe086af672c971.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36a242fdf45fcea8a806a8df21299049d99a2c6a3489a9a6ddfdab82626df1ef
size 23368
3
finn_wake_dataset/cache-fb842fe1846f33ac.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fb719489a9a6868c7f28d4e237e1d7ad516b74333fc2adff4dea6010d26e381b
size 23368
3
finn_wake_dataset/cache-fda59bdb198b3ef4.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d9bd80aa28367576aaa834a272d4a192d554d2fc3f70914d1b53db9851f04231
size 2968
3
finn_wake_dataset/cache-fefdfd5378240940.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:867fbcea23ec00b23b23e36e1522a1795d6aa179f27d098fae2973fe3b3ba38c
size 2968
3
finn_wake_dataset/data-00000-of-00001.arrow
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f3684e77cfcc0fdcc1a627db2f5caa5f306f61f70ee4d2434f205293fc491037
size 1321944
12
finn_wake_dataset/dataset_info.json
Normal file
@@ -0,0 +1,12 @@
{
  "citation": "",
  "description": "",
  "features": {
    "text": {
      "dtype": "string",
      "_type": "Value"
    }
  },
  "homepage": "",
  "license": ""
}
13
finn_wake_dataset/state.json
Normal file
@@ -0,0 +1,13 @@
{
  "_data_files": [
    {
      "filename": "data-00000-of-00001.arrow"
    }
  ],
  "_fingerprint": "953e00a2598c0e70",
  "_format_columns": null,
  "_format_kwargs": {},
  "_format_type": null,
  "_output_all_columns": false,
  "_split": null
}
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
{
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 2048,
  "pad_token_id": 0,
  "transformers_version": "4.37.2"
}
3
ggml-model-f16.gguf
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09ae44fe0ac6ed4c693a089f85c1b32fa774ba43b4fbffe73a06e4e74175bf12
size 2201017344
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f63f11b47b09bb6ff12cff716dea1240718e328d2741beb5cfe27feb51b931b
size 4400216536
3
optimizer.pt
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:343949e9115ed68d9e55d79d24e36360fcf8e27cb05da7490477c2be68e90e1c
size 8800555534
23
process_wake.py
Normal file
@@ -0,0 +1,23 @@
import pandas as pd
from datasets import Dataset

def is_page_number(line):
    return line.strip().isdigit()

with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Drop lines that consist only of a page number.
filtered_lines = [line for line in lines if not is_page_number(line)]
text = ''.join(filtered_lines)

def split_paragraph_into_smaller_parts(paragraph, max_length=100):
    """Split a paragraph into smaller parts with a maximum length in words."""
    words = paragraph.split()
    for i in range(0, len(words), max_length):
        yield ' '.join(words[i:i+max_length])

paragraphs = text.split('\n')
split_paragraphs = []
for paragraph in paragraphs:
    if paragraph.strip() != "":
        split_paragraphs.extend(split_paragraph_into_smaller_parts(paragraph, max_length=100))

df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)

# Persist both a CSV copy and a HuggingFace dataset directory.
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')
3
rng_state.pth
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
size 14244
3
scheduler.pt
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aededd21cf97b36e3bd6c09cddfa5fcf4ec86682b9ad2cd27542eb68f799a7e7
size 1064
30
special_tokens_map.json
Normal file
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
27
text_gen.py
Normal file
@@ -0,0 +1,27 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

# Post-training: load the fine-tuned checkpoint and the base tokenizer.
model_path = "./results/checkpoint-12000"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("tinyllama/tinyllama-1.1b-chat-v1.0")

input_text = "ae left to go to ireland and found a fairy"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Beam-search multinomial sampling: do_sample=True combined with num_beams=5.
output = model.generate(
    input_ids=input_ids,
    max_length=400,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    num_beams=5,
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)
93391
tokenizer.json
Normal file
File diff suppressed because it is too large
3
tokenizer.model
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
size 499723
42
tokenizer_config.json
Normal file
@@ -0,0 +1,42 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 2048,
  "pad_token": "</s>",
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
7221
trainer_state.json
Normal file
File diff suppressed because it is too large
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38629dcd88d8b9f70ec05e265d9d93a102045189b8a3015d57602fde7d693a06
size 4600