Files
enginex-bi_series-vc-cnn/scripts/convert_callback_dataset.py
zhousha 55a67e817e update
2025-08-06 15:38:55 +08:00

109 lines
3.0 KiB
Python

import json
import os
import shutil
import sys
import zipfile
import yaml
"""
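Target layout of the data.yaml written for each language (shown as JSON).
"query_data" is a list of objects, one per audio file:
{
    "global": {
        "lang": ""
    },
    "query_data": [
        {
            "file": "",
            "duration": 2.0,
            "voice": [
                {
                    "answer": "",
                    "start": 0.0,
                    "end": 1.0
                }
            ]
        }
    ]
}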
"""
def _zip_folder(folder, zip_path):
    """Pack every file under *folder* into a deflate-compressed zip at *zip_path*.

    Each entry is stored under its path relative to *folder*.
    """
    with zipfile.ZipFile(
        zip_path, "w", compression=zipfile.ZIP_DEFLATED
    ) as archive:
        for root, _, files in os.walk(folder):
            for name in files:
                full_path = os.path.join(root, name)
                archive.write(full_path, os.path.relpath(full_path, folder))


def situation_a(meta, dataset_folder, output_folder):
    """Convert a "combined" dataset description into per-language packages.

    Expected shape of *meta*::

        {
            "combined": {
                "en": [
                    {
                        "wav": "*.wav",
                        "transcriptions": [
                            {"text": "", "start": 0.0, "end": 1.0}
                        ],
                        "duration": 2.0
                    }
                ]
            }
        }

    For every language key under ``meta["combined"]`` this function:

    1. creates ``<output_folder>/<lang>/``;
    2. copies, for each item, the ``.mp3`` file that shares the stem of the
       item's ``"wav"`` path from *dataset_folder* into that folder,
       keeping the relative sub-directory;
    3. writes ``<output_folder>/<lang>/data.yaml`` containing
       ``{"global": {"lang": lang}, "query_data": [...]}``, where each entry
       has ``"file"``, ``"duration"`` and a ``"voice"`` list of
       ``{"answer", "start", "end"}`` segments;
    4. zips the language folder into ``<output_folder>/<lang>.zip``.

    Args:
        meta: Parsed metadata dictionary with a ``"combined"`` key.
        dataset_folder: Folder the ``.mp3`` files are copied from.
        output_folder: Folder that receives the language folders and zips.

    Raises:
        AssertionError: If a language key is not exactly two characters long.
        KeyError: If an expected key is missing from *meta* or its items.
        OSError: If a source audio file is missing or output cannot be written.
    """
    for lang, items in meta["combined"].items():
        print("processing", lang)
        # Language keys are expected to be two-character codes.
        assert len(lang) == 2
        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)

        query_data = []
        for item in items:
            # The metadata names a .wav file, but the file that is copied is
            # the .mp3 with the same stem.  splitext() swaps the extension
            # whatever its length, instead of blindly cutting four characters.
            mp3_file = os.path.splitext(item["wav"])[0] + ".mp3"
            os.makedirs(
                os.path.join(lang_folder, os.path.dirname(mp3_file)),
                exist_ok=True,
            )
            shutil.copyfile(
                os.path.join(dataset_folder, mp3_file),
                os.path.join(lang_folder, mp3_file),
            )
            query_data.append(
                {
                    "file": mp3_file,
                    "duration": float(item["duration"]),
                    "voice": [
                        {
                            "answer": segment["text"],
                            "start": float(segment["start"]),
                            "end": float(segment["end"]),
                        }
                        for segment in item["transcriptions"]
                    ],
                }
            )

        data = {"global": {"lang": lang}, "query_data": query_data}
        # Open the file as UTF-8 explicitly: PyYAML writes ``str`` to a
        # text-mode stream, so the file's own encoding (not the ``encoding``
        # argument of ``dump``) decides the bytes on disk.  Without it,
        # allow_unicode=True output depends on the platform locale.
        with open(
            os.path.join(lang_folder, "data.yaml"), "w", encoding="utf-8"
        ) as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")

        # The archive lives next to (not inside) the language folder, so it
        # never ends up containing itself.
        _zip_folder(lang_folder, os.path.join(output_folder, lang + ".zip"))
if __name__ == "__main__":
    # Usage: <script> <dataset folder> <output folder>
    if len(sys.argv) < 3:
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]
    # Read meta.json as UTF-8 explicitly so that non-ASCII transcripts are
    # decoded the same way regardless of the platform's locale encoding.
    with open(
        os.path.join(dataset_folder, "meta.json"), encoding="utf-8"
    ) as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)