初始化项目,由ModelHub XC社区提供模型
Model: npc-worldwide/TinyTimV1 Source: Original Platform
This commit is contained in:
23
process_wake.py
Normal file
23
process_wake.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import re
|
||||
def is_page_number(line: str) -> bool:
    """Return True when *line* holds nothing but digits.

    Leading and trailing whitespace (including the line terminator) is
    ignored, so a line such as ``" 23 "`` counts as a page number. Blank
    lines and lines that mix digits with any other character do not.

    NOTE: ``str.isdigit`` also accepts non-ASCII digit characters (for
    example superscript digits), not only ``0``-``9``.
    """
    return line.strip().isdigit()
|
||||
# Load the raw text. readlines() keeps each line's terminator, so the
# surviving lines can be re-joined below without inserting separators.
with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Drop lines that consist solely of a page number, then rebuild one string.
# This runs after the `with` block so the file handle is already closed.
filtered_lines = [line for line in lines if not is_page_number(line)]
text = ''.join(filtered_lines)
|
||||
from datasets import Dataset
|
||||
import pandas as pd
|
||||
def split_paragraph_into_smaller_parts(paragraph: str, max_length: int = 100):
    """Split a paragraph into smaller parts with a maximum length in words.

    Words are separated on any run of whitespace, so each yielded part has
    its words re-joined with single spaces.

    Args:
        paragraph: Text to split.
        max_length: Maximum number of words per part; must be at least 1.

    Yields:
        Consecutive parts of at most ``max_length`` words each, in order.
        Nothing is yielded when the paragraph contains no words.

    Raises:
        ValueError: If ``max_length`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_length < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step would silently produce no parts at all.
        raise ValueError(f"max_length must be at least 1, got {max_length}")
    words = paragraph.split()
    for start in range(0, len(words), max_length):
        yield ' '.join(words[start:start + max_length])
|
||||
# Treat every non-blank line of the cleaned text as a paragraph and break it
# into chunks of at most 100 words.
paragraphs = text.split('\n')
split_paragraphs = []
for paragraph in paragraphs:
    if paragraph.strip():
        split_paragraphs.extend(
            split_paragraph_into_smaller_parts(paragraph, max_length=100)
        )

# One chunk per row, in a single 'text' column.
df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)

# Persist the chunks twice: as a CSV file (without the index column) and via
# Dataset.save_to_disk under 'finn_wake_dataset'.
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')
|
||||
Reference in New Issue
Block a user