109 lines
3.0 KiB
Python
109 lines
3.0 KiB
Python
import json
|
|
import os
|
|
import shutil
|
|
import sys
|
|
import zipfile
|
|
|
|
import yaml
|
|
|
|
"""
|
|
Target structure written to each language's data.yaml (shown here as JSON):

{
    "global": {
        "lang": ""
    },
    "query_data": [
        {
            "file": "",
            "duration": 2.0,
            "voice": [
                {
                    "answer": "",
                    "start": 0.0,
                    "end": 1.0
                }
            ]
        }
    ]
}
|
|
"""
|
|
|
|
|
|
def _to_query_item(item, mp3_file):
    """Convert one source metadata entry into a target ``query_data`` entry."""
    return {
        "file": mp3_file,
        "duration": float(item["duration"]),
        "voice": [
            {
                "answer": segment["text"],
                "start": float(segment["start"]),
                "end": float(segment["end"]),
            }
            for segment in item["transcriptions"]
        ],
    }


def _zip_folder(folder, zip_path):
    """Pack every file under *folder* into *zip_path*, named relative to *folder*."""
    with zipfile.ZipFile(zip_path, "w") as archive:
        for root, _, files in os.walk(folder):
            for name in files:
                full_path = os.path.join(root, name)
                archive.write(
                    full_path,
                    os.path.relpath(full_path, folder),
                    zipfile.ZIP_DEFLATED,
                )


def situation_a(meta, dataset_folder, output_folder):
    """Convert a "combined" dataset into per-language folders and zip archives.

    Expected shape of *meta*::

        {
            "combined": {
                "en": [
                    {
                        "wav": "*.wav",
                        "transcriptions": [
                            {
                                "text": "",
                                "start": 0.0,
                                "end": 1.0
                            }
                        ],
                        "duration": 2.0
                    }
                ]
            }
        }

    For every language key under ``meta["combined"]`` this function:

    1. creates ``<output_folder>/<lang>/``;
    2. copies, for each entry, the ``.mp3`` file whose relative path is the
       entry's ``wav`` path with the extension replaced, from
       *dataset_folder* into the language folder;
    3. writes ``data.yaml`` into the language folder with the ``global`` /
       ``query_data`` structure;
    4. packs the language folder into ``<output_folder>/<lang>.zip``.

    Args:
        meta: Parsed metadata dict containing a ``"combined"`` mapping of
            language code to a list of entries.
        dataset_folder: Folder the ``.mp3`` files are copied from.
        output_folder: Folder that receives the language folders and zips.

    Raises:
        KeyError: If ``"combined"`` or a required entry field is missing.
        AssertionError: If a language key is not exactly two characters
            long (not checked when Python runs with ``-O``).
        OSError: If a source ``.mp3`` file is missing or cannot be copied.
    """
    for lang, entries in meta["combined"].items():
        print("processing", lang)
        assert len(lang) == 2, "language key must be a 2-letter code"
        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)

        query_data = []
        for item in entries:
            # splitext swaps the extension whatever its length, instead of
            # blindly cutting the last four characters.
            mp3_file = os.path.splitext(item["wav"])[0] + ".mp3"
            os.makedirs(
                os.path.join(lang_folder, os.path.dirname(mp3_file)),
                exist_ok=True,
            )
            shutil.copyfile(
                os.path.join(dataset_folder, mp3_file),
                os.path.join(lang_folder, mp3_file),
            )
            query_data.append(_to_query_item(item, mp3_file))

        data = {"global": {"lang": lang}, "query_data": query_data}

        # allow_unicode=True writes non-ASCII text unescaped, so the file
        # itself must be opened as UTF-8; a text-mode file encodes with its
        # own encoding (the locale default when none is given).
        with open(
            os.path.join(lang_folder, "data.yaml"), "w", encoding="utf-8"
        ) as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")

        # The archive sits next to the language folder, not inside it, so
        # it is never picked up by its own directory walk.
        _zip_folder(lang_folder, os.path.join(output_folder, lang + ".zip"))
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> DATASET_FOLDER OUTPUT_FOLDER
    if len(sys.argv) < 3:
        # Message reads: "specify the dataset folder path and the output path".
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]

    # Read the metadata as UTF-8 explicitly so non-ASCII transcriptions do
    # not depend on the platform's default locale encoding.
    with open(os.path.join(dataset_folder, "meta.json"), encoding="utf-8") as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)
|