"""Check the voice timing data of every test case in a dataset folder.

Each sub-directory of the dataset folder must contain a ``data.yaml`` file
with a ``query_data`` list.  Every query holds a ``voice`` list whose items
carry ``start``, ``end`` and ``answer`` fields.  Two kinds of problems are
reported on stdout:

* ``err1`` -- a voice whose ``start`` is greater than its ``end``;
* ``err2`` -- a voice whose ``start`` is smaller than the ``end`` of the
  voice that precedes it once the voices are ordered by ``start``.
"""
import os
import sys
from collections import defaultdict

import yaml


def _check_query(case_name, query_i, query):
    """Print every timing problem of one query.

    Args:
        case_name: Name of the test-case directory, used in the messages.
        query_i: Index of the query inside ``query_data``.
        query: Mapping with a ``voice`` list (may be empty or null).

    Returns:
        A ``(has_problem, overlap_count)`` tuple: whether any err1/err2 was
        reported, and how many err2 overlaps were found.
    """
    # An empty or null voice list simply has nothing to check; indexing
    # voices[0] unconditionally used to raise IndexError here.
    original_order = query["voice"] or []
    voices = sorted(original_order, key=lambda v: v["start"])
    if voices != original_order:
        # Marker line: the voices of this case are not stored sorted by start.
        print("-----", case_name)

    has_problem = False
    overlap_count = 0
    previous = None
    for voice_i, voice in enumerate(voices):
        if voice["start"] > voice["end"]:
            print(
                "err1: %s 第%s个query的第%d个voice的start大于end: %s"
                % (case_name, query_i, voice_i, voice["answer"])
            )
            has_problem = True
        # The first voice has no predecessor to overlap with.
        if previous is not None and voice["start"] < previous["end"]:
            print(
                "err2: %s 第%s个query的第%d个voice的start小于前一个voice的end: %s"
                % (case_name, query_i, voice_i, voice["answer"])
            )
            has_problem = True
            overlap_count += 1
        previous = voice
    return has_problem, overlap_count


def main(dataset_dir):
    """Validate all test cases below *dataset_dir* and print a summary.

    After the per-voice messages, three lines are printed: the number of
    test-case directories, the set of directories with at least one problem,
    and a mapping from directory name to its number of err2 overlaps.

    Args:
        dataset_dir: Path of the folder whose sub-directories are test cases.

    Raises:
        OSError: If *dataset_dir* or a case's ``data.yaml`` cannot be read.
        KeyError: If a file lacks one of the expected keys.
    """
    case_names = [
        name
        for name in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, name))
    ]
    problem_dirs = set()
    problem_count = defaultdict(int)

    for case_name in case_names:
        yaml_path = os.path.join(dataset_dir, case_name, "data.yaml")
        # Binary mode lets PyYAML decode the stream itself (UTF-8, or UTF-16
        # with a BOM) instead of depending on the platform's locale encoding.
        with open(yaml_path, "rb") as f:
            # NOTE(review): full_load can build more than plain data types;
            # prefer yaml.safe_load if the dataset may come from an
            # untrusted source.
            data = yaml.full_load(f)

        for query_i, query in enumerate(data["query_data"]):
            has_problem, overlap_count = _check_query(case_name, query_i, query)
            if has_problem:
                problem_dirs.add(case_name)
            if overlap_count:
                # Only touch the counter for real overlaps so that clean
                # cases do not show up with a zero entry in the summary.
                problem_count[case_name] += overlap_count

    print(len(case_names))
    print(problem_dirs)
    print(problem_count)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Usage hint: the dataset folder must be given as first argument.
        print("指定 测试数据集文件夹")
        sys.exit(1)
    main(sys.argv[1])