This commit is contained in:
zhousha
2025-08-06 15:38:55 +08:00
parent 4916ad0fe0
commit 55a67e817e
193 changed files with 51647 additions and 1 deletions

View File

@@ -0,0 +1,53 @@
import os
import sys
from collections import defaultdict
import yaml
def main(dataset_dir):
    """Check the voice timestamps of every dataset found under ``dataset_dir``.

    Every immediate subdirectory of ``dataset_dir`` is treated as one dataset
    and must contain a ``data.yaml`` file with a ``query_data`` list. Each
    query carries a ``voice`` list whose entries have ``answer``, ``start``
    and ``end`` keys. After sorting the voices of a query by ``start``, two
    problems are reported on stdout:

    * ``err1`` - a voice whose ``start`` is greater than its ``end``;
    * ``err2`` - a voice that starts before the previous voice has ended.

    A ``-----`` line is printed for a query whose voices were not already
    sorted by ``start``. At the end the number of datasets, the set of
    datasets with at least one problem, and the per-dataset ``err2`` count
    are printed.

    Args:
        dataset_dir: Path of the folder that holds the dataset directories.

    Raises:
        FileNotFoundError: If a subdirectory has no ``data.yaml``.
    """
    dataset_names = [
        name
        for name in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, name))
    ]

    problem_dirs = set()
    problem_count = defaultdict(int)
    for dataset_name in dataset_names:
        yaml_path = os.path.join(dataset_dir, dataset_name, "data.yaml")
        # Read explicitly as UTF-8: the transcripts are non-ASCII text and the
        # locale default encoding is not UTF-8 on every platform.
        with open(yaml_path, "r", encoding="utf-8") as f:
            # NOTE(review): full_load can build more than plain data types;
            # prefer yaml.safe_load if these files may come from an untrusted
            # source.
            data = yaml.full_load(f)

        for query_i, query in enumerate(data["query_data"]):
            # A query without any voice (empty list or null) has nothing to
            # check; treating it as an empty list avoids indexing voices[0].
            original_voices = query["voice"] or []
            voices = sorted(original_voices, key=lambda v: v["start"])
            if voices != original_voices:
                print("-----", dataset_name)

            previous = None
            for voice_i, voice in enumerate(voices):
                if voice["start"] > voice["end"]:
                    print(
                        "err1: %s%s个query的第%d个voice的start大于end: %s"
                        % (dataset_name, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(dataset_name)
                # The overlap check needs a predecessor, so it is skipped for
                # the first voice of the query.
                if previous is not None and voice["start"] < previous["end"]:
                    print(
                        "err2: %s%s个query的第%d个voice的start小于前一个voice的end: %s"
                        % (dataset_name, query_i, voice_i, voice["answer"])
                    )
                    problem_dirs.add(dataset_name)
                    problem_count[dataset_name] += 1
                previous = voice

    print(len(dataset_names))
    print(problem_dirs)
    print(problem_count)
if __name__ == "__main__":
    # The dataset folder is the single required command-line argument.
    if len(sys.argv) >= 2:
        main(sys.argv[1])
    else:
        print("指定 测试数据集文件夹")
        sys.exit(1)

View File

@@ -0,0 +1,108 @@
import json
import os
import shutil
import sys
import zipfile
import yaml
"""
target
{
    "global": {
        "lang": ""
    },
    "query_data": [
        {
            "file": "",
            "duration": 2.0,
            "voice": [
                {
                    "answer": "",
                    "start": 0.0,
                    "end": 1.0
                }
            ]
        }
    ]
}
"""
def situation_a(meta, dataset_folder, output_folder):
    """Convert a "combined" meta description into one dataset per language.

    Expected shape of ``meta``::

        {
            "combined": {
                "en": [
                    {
                        "wav": "*.wav",
                        "transcriptions": [
                            {
                                "text": "",
                                "start": 0.0,
                                "end": 1.0
                            }
                        ],
                        "duration": 2.0
                    }
                ]
            }
        }

    For every language key under ``meta["combined"]`` this function:

    1. creates ``<output_folder>/<lang>/``;
    2. copies the ``.mp3`` counterpart of each item's ``wav`` path from
       ``dataset_folder`` into that folder, keeping the relative path;
    3. writes ``data.yaml`` with a ``global.lang`` entry and one
       ``query_data`` entry (``file``, ``duration``, ``voice``) per item;
    4. packs the language folder into ``<output_folder>/<lang>.zip``.

    Args:
        meta: Parsed meta description (see the shape above).
        dataset_folder: Folder that contains the source ``.mp3`` files.
        output_folder: Folder that receives the language folders and zips.

    Raises:
        AssertionError: If a language key is not exactly two characters long.
        FileNotFoundError: If an expected ``.mp3`` file is missing.
    """
    for lang, items in meta["combined"].items():
        print("processing", lang)
        assert len(lang) == 2
        lang_folder = os.path.join(output_folder, lang)
        os.makedirs(lang_folder, exist_ok=True)

        query_data = []
        for item in items:
            # The meta file names the .wav file; the dataset ships the audio
            # under the same path with an .mp3 extension.
            mp3_file = os.path.splitext(item["wav"])[0] + ".mp3"
            os.makedirs(
                os.path.join(lang_folder, os.path.dirname(mp3_file)),
                exist_ok=True,
            )
            shutil.copyfile(
                os.path.join(dataset_folder, mp3_file),
                os.path.join(lang_folder, mp3_file),
            )
            query_data.append(
                {
                    "file": mp3_file,
                    "duration": float(item["duration"]),
                    "voice": [
                        {
                            "answer": v["text"],
                            "start": float(v["start"]),
                            "end": float(v["end"]),
                        }
                        for v in item["transcriptions"]
                    ],
                }
            )

        data = {"global": {"lang": lang}, "query_data": query_data}
        # allow_unicode=True emits raw non-ASCII characters and PyYAML writes
        # str to a text stream, so the file handle's encoding decides the
        # bytes on disk; open it explicitly as UTF-8 rather than relying on
        # the locale default.
        with open(
            os.path.join(lang_folder, "data.yaml"), "w", encoding="utf-8"
        ) as f:
            yaml.dump(data, f, indent=2, allow_unicode=True, encoding="utf-8")

        zip_path = os.path.join(output_folder, lang + ".zip")
        with zipfile.ZipFile(zip_path, "w") as archive:
            for folder, _, files in os.walk(lang_folder):
                for file_name in files:
                    full_path = os.path.join(folder, file_name)
                    archive.write(
                        full_path,
                        # Store entries relative to the language folder.
                        os.path.relpath(full_path, lang_folder),
                        zipfile.ZIP_DEFLATED,
                    )
if __name__ == "__main__":
    # Usage: <script> <dataset_folder> <output_folder>
    if len(sys.argv) < 3:
        print("指定 数据集文件夹路径 输出路径")
        sys.exit(1)
    dataset_folder = sys.argv[1]
    output_folder = sys.argv[2]
    # Read meta.json explicitly as UTF-8 so multilingual transcripts load the
    # same way regardless of the platform's locale encoding.
    with open(
        os.path.join(dataset_folder, "meta.json"), encoding="utf-8"
    ) as f:
        meta = json.load(f)
    situation_a(meta, dataset_folder, output_folder)

View File

@@ -0,0 +1,56 @@
import json
import sys
from schemas.dataset import QueryData
from schemas.stream import StreamDataModel
from utils.evaluator_plus import evaluate_editops
def main(detailcase_file: str):
    """Evaluate the first case stored in a detail-case JSON file.

    The file must hold a JSON list; only its first element is used. The
    element's ``preds`` entries are turned into ``StreamDataModel`` objects,
    those whose ``final_result`` is falsy are dropped, and the rest is
    compared against the element's ``label`` (as ``QueryData``) with
    ``evaluate_editops``. The result is printed.

    Args:
        detailcase_file: Path of the detail-case JSON file.
    """
    with open(detailcase_file) as fh:
        case = json.load(fh)[0]

    final_preds = [
        model
        for model in (StreamDataModel(**raw) for raw in case["preds"])
        if model.final_result
    ]
    label = QueryData(**case["label"])
    print(evaluate_editops(label, final_preds))
def evaluate_from_record(detailcase_file: str, record_path: str):
    """Evaluate a case using tokens and results saved in a record file.

    The label comes from the first element of the detail-case JSON list.
    The record JSON supplies ``tokens_pred``, ``tokens_label`` and
    ``recognition_results``. Only recognition results whose ``final_result``
    is truthy are kept, together with the ``tokens_pred`` entry at the same
    index. The outcome of ``evaluate_editops`` is printed.

    Args:
        detailcase_file: Path of the detail-case JSON file.
        record_path: Path of the record JSON file.
    """
    with open(detailcase_file) as fh:
        case = json.load(fh)[0]
    label = QueryData(**case["label"])

    with open(record_path) as fh:
        record = json.load(fh)
    tokens_pred = record["tokens_pred"]
    tokens_label = record["tokens_label"]
    results = [
        StreamDataModel(**raw) for raw in record["recognition_results"]
    ]

    # Positions of the final results; tokens_pred is indexed in parallel.
    final_positions = [
        pos for pos, result in enumerate(results) if result.final_result
    ]
    kept_tokens = [tokens_pred[pos] for pos in final_positions]
    kept_results = [results[pos] for pos in final_positions]

    print(evaluate_editops(label, kept_results, kept_tokens, tokens_label))
if __name__ == "__main__":
    # The detail-case file path is the single required argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("请指定 detailcase 文件路径")
        sys.exit(1)
    main(cli_args[0])
    # Alternative entry point that also takes a record file:
    # evaluate_from_record(sys.argv[1], sys.argv[2])