"""Convert a speech dataset described by ``meta.json`` into per-language packages.

For every language found in the metadata the script copies the referenced
audio files into ``<output>/<lang>/``, writes ``<output>/<lang>/data.yaml``
and archives that folder as ``<output>/<lang>.zip``.

Structure of the generated ``data.yaml``::

    {
        "global": {"lang": ""},
        "query_data": [
            {
                "file": "",
                "duration": 2.0,
                "voice": [
                    {"answer": "", "start": 0.0, "end": 1.0}
                ]
            }
        ]
    }
"""

import json
import os
import shutil
import sys
import zipfile

import yaml


def _build_query_item(item, audio_file):
    """Return the ``query_data`` entry for one metadata *item* stored as *audio_file*."""
    return {
        "file": audio_file,
        "duration": float(item["duration"]),
        "voice": [
            {
                "answer": segment["text"],
                "start": float(segment["start"]),
                "end": float(segment["end"]),
            }
            for segment in item["transcriptions"]
        ],
    }


def _zip_folder(folder, zip_path):
    """Archive every file below *folder* into *zip_path* with folder-relative names."""
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for root, _, names in os.walk(folder):
            for name in names:
                full_path = os.path.join(root, name)
                # relpath yields clean member names ("sub/file") instead of
                # relying on ZipFile to strip a leading separator.
                archive.write(full_path, os.path.relpath(full_path, folder))


def situation_a(meta, dataset_folder, output_folder):
    """Build one folder and one zip archive per language from *meta*.

    Expected shape of *meta*::

        {
            "combined": {
                "en": [
                    {
                        "wav": "*.wav",
                        "transcriptions": [
                            {"text": "", "start": 0.0, "end": 1.0}
                        ],
                        "duration": 2.0
                    }
                ]
            }
        }

    Args:
        meta: Parsed metadata; only the ``"combined"`` mapping is read.
        dataset_folder: Folder holding the audio files. Each ``"wav"`` entry
            is looked up there with its extension replaced by ``.mp3``.
        output_folder: Destination folder; receives ``<lang>/`` (audio files
            plus ``data.yaml``) and ``<lang>.zip`` for each language.

    Raises:
        AssertionError: If a language key is not exactly two characters long.
        KeyError: If a required metadata key is missing.
        OSError: If an audio file cannot be copied or an output written.
    """
    for lang, items in meta["combined"].items():
        print("processing", lang)
        assert len(lang) == 2, f"unexpected language key: {lang!r}"

        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)

        query_data = []
        for item in items:
            # The metadata names *.wav files, but the dataset ships *.mp3.
            # splitext swaps the extension without assuming its length.
            mp3_file = os.path.splitext(item["wav"])[0] + ".mp3"
            destination = os.path.join(lang_folder, mp3_file)
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            shutil.copyfile(os.path.join(dataset_folder, mp3_file), destination)
            query_data.append(_build_query_item(item, mp3_file))

        data = {"global": {"lang": lang}, "query_data": query_data}
        # allow_unicode=True writes non-ASCII text verbatim, so the file must
        # be opened as UTF-8 explicitly instead of using the locale default.
        yaml_path = os.path.join(lang_folder, "data.yaml")
        with open(yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")

        # The archive lives next to (not inside) the language folder, so it
        # never ends up containing itself.
        _zip_folder(lang_folder, os.path.join(output_folder, lang + ".zip"))


if __name__ == "__main__":
    # Usage: <script> <dataset folder> <output folder>
    if len(sys.argv) < 3:
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]
    # Read the metadata as UTF-8 rather than the locale default encoding.
    with open(os.path.join(dataset_folder, "meta.json"), encoding="utf-8") as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)