update
This commit is contained in:
53
scripts/check_dataset_time.py
Normal file
53
scripts/check_dataset_time.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def main(dataset_dir):
    """Check the voice time ranges of every test case under *dataset_dir*.

    Each immediate sub-directory of *dataset_dir* is expected to contain a
    ``data.yaml`` file whose top-level ``query_data`` entry is a list of
    queries, each with a ``voice`` list of mappings holding ``start``,
    ``end`` and ``answer``.  For every query the voices are ordered by
    ``start`` and two checks are run:

    * err1 - a voice whose ``start`` is greater than its own ``end``;
    * err2 - a voice whose ``start`` is less than the ``end`` of the voice
      that precedes it in start order (the two ranges overlap).

    Findings are printed as they are found.  At the end the number of
    scanned sub-directories, the set of sub-directories with any finding
    and the per-directory err2 tally are printed.

    Args:
        dataset_dir: Path of the folder holding one sub-directory per case.

    Raises:
        OSError: If *dataset_dir* or a sub-directory's ``data.yaml`` cannot
            be read.
        KeyError: If a ``data.yaml`` lacks one of the keys named above.
    """
    case_names = [
        name
        for name in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, name))
    ]

    problem_dirs = set()
    problem_count = defaultdict(int)
    for case_name in case_names:
        yaml_path = os.path.join(dataset_dir, case_name, "data.yaml")
        # Binary mode lets PyYAML detect the encoding itself (BOM, else
        # UTF-8) instead of relying on the platform's default text encoding.
        with open(yaml_path, "rb") as f:
            # NOTE(review): full_load can build more than plain YAML types;
            # switch to yaml.safe_load if the datasets never need that.
            data = yaml.full_load(f)

        for query_i, query in enumerate(data["query_data"]):
            voices = sorted(query["voice"], key=lambda v: v["start"])
            if voices != query["voice"]:
                # The file does not list this query's voices in start order.
                print("-----", case_name)

            # A single pass covers the first voice too, so an empty voice
            # list is simply skipped instead of failing on voices[0].
            previous = None
            for voice_i, voice in enumerate(voices):
                if voice["start"] > voice["end"]:
                    print(
                        "err1: %s 第%s个query的第%d个voice的start大于end: %s"
                        % (case_name, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(case_name)
                if previous is not None and voice["start"] < previous["end"]:
                    print(
                        "err2: %s 第%s个query的第%d个voice的start小于前一个voice的end: %s"
                        % (case_name, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(case_name)
                    # NOTE(review): only err2 findings are tallied, following
                    # the original statement order - confirm err1 should not
                    # be counted as well.
                    problem_count[case_name] += 1
                previous = voice

    print(len(case_names))
    print(problem_dirs)
    print(problem_count)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The dataset folder is the first command-line argument; any further
    # arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        # Usage hint (kept verbatim): "specify the test dataset folder".
        print("指定 测试数据集文件夹")
        sys.exit(1)
    main(cli_args[0])
|
||||
Reference in New Issue
Block a user