import logging import os import threading import time from typing import Optional import flask import requests from werkzeug.datastructures import FileStorage app = flask.Flask(__name__) heartbeat_active = False log = logging.getLogger(__name__) log.propagate = False level = logging.INFO log.setLevel(level) formatter = logging.Formatter( "[%(asctime)s] %(levelname)s : %(pathname)s:%(lineno)d - %(message)s", "%Y-%m-%d %H:%M:%S", ) streamHandler = logging.StreamHandler() streamHandler.setLevel(level) streamHandler.setFormatter(formatter) log.addHandler(streamHandler) def heartbeat(url): global heartbeat_active if heartbeat_active: return heartbeat_active = True while True: try: requests.post(url, json={"status": "RUNNING"}) except Exception: pass time.sleep(10) def asr( audio_file: FileStorage, language: Optional[str], progressCallbackUrl: str, taskId: str, ): """TODO: 读取audio_file, 调用语音识别服务, 实时返回识别结果""" # ignore BEGIN # 此处为榜单本地测试使用 if os.getenv("LOCAL_TEST"): return local_test(progressCallbackUrl, taskId) # ignore END language = "de" # 某一次识别返回 requests.post( progressCallbackUrl, json={ "taskId": taskId, "status": "RUNNING", "recognition_results": { # 传增量结果, status如果是FINISHED, 或者ERROR, 这个字段请不要传值 "text": "最先启动的还是", "final_result": True, "para_seq": 0, "language": language, "start_time": 6300, "end_time": 6421, "words": [ { "text": "最", "start_time": 6300, "end_time": 6321, }, { "text": "先", "start_time": 6321, "end_time": 6345, }, { "text": "启", "start_time": 6345, "end_time": 6350, }, { "text": "动", "start_time": 6350, "end_time": 6370, }, { "text": "的", "start_time": 6370, "end_time": 6386, }, { "text": "还", "start_time": 6386, "end_time": 6421, }, { "text": "是", "start_time": 6421, "end_time": 6435, }, ], }, }, ) # ... 识别结果返回完毕 # 识别结束 requests.post( progressCallbackUrl, json={ "taskId": taskId, "status": "FINISHED", }, ) @app.post("/predict") def predict(): body = flask.request.form language = body.get("language") if language is None: "自行判断语种" taskId = body["taskId"] progressCallbackUrl = body["progressCallbackUrl"] heartbeatUrl = body["heartbeatUrl"] threading.Thread( target=heartbeat, args=(heartbeatUrl,), daemon=True ).start() audio_file = flask.request.files["file"] # audio_file.stream # 读取文件流 # audio_file.save("audio.mp3") # 保存文件 threading.Thread( target=asr, args=(audio_file, language, progressCallbackUrl, taskId), daemon=True, ).start() return flask.jsonify({"status": "OK"}) # ignore BEGIN def local_test(progressCallbackUrl: str, taskId: str): """忽略此方法, 此方法为榜单本地调试使用""" import random import re import yaml def callback(content): try: if content is None: requests.post( progressCallbackUrl, json={"taskId": taskId, "status": "FINISHED"}, ) else: requests.post( progressCallbackUrl, json={ "taskId": taskId, "status": "RUNNING", "recognition_results": content, }, ) except Exception: pass with open( os.getenv("LOCAL_TEST_DATA_PATH", "../dataset/out/data.yaml") ) as f: data = yaml.full_load(f) voices = data["query_data"][0]["voice"] # 首次发送 first_send_time = random.randint(3, 5) send_interval = random.random() * 0 log.info("首次发送%ss 发送间隔%ss" % (first_send_time, send_interval)) time.sleep(first_send_time) # 将句子拼接到一起 if random.random() < 0.3: log.info("将部分句子合并成单句 每次合并的句子不超过3句") rand_idx = 0 rand_sep = [0, len(voices) - 1] while rand_sep[rand_idx] + 1 <= rand_sep[rand_idx + 1] - 1: rand_cursep = random.randint( rand_sep[rand_idx] + 1, min(rand_sep[rand_idx + 1] - 1, rand_sep[rand_idx] + 1 + 3), ) rand_sep.insert(rand_idx + 1, rand_cursep) rand_idx += 1 merged_voices = [] for i, cur_sep in enumerate(rand_sep[:-1]): voice = voices[cur_sep] for j in range(cur_sep + 1, rand_sep[i + 1]): voice["answer"] += voices[j]["answer"] voice["end"] = voices[j]["end"] merged_voices.append(voice) merged_voices.append(voices[rand_sep[-1]]) voices = merged_voices def split_and_keep(text, delimiters): # 构建正则表达式模式,匹配文本或分隔符 pattern = "|".join(re.escape(delimiter) for delimiter in delimiters) pattern = f"(?:[^{pattern}]+|[{pattern}])" return re.findall(pattern, text) puncs = [",", ".", "?", "!", ";", ":"] para_seq = 0 for voice in voices: answer: str = voice["answer"] start_time: float = voice["start"] end_time: float = voice["end"] words = split_and_keep(answer, puncs) temp_words = [] for i, word in enumerate(words): if i > 0 and i < len(words) - 1 and random.random() < 0.15: log.info("随机删除word") continue temp_words.extend(word.split(" ")) if len(temp_words) == 0: temp_words = words[0].split(" ") words = temp_words answer = " ".join(words) words = list(map(lambda x: x.strip(), words)) words = list(filter(lambda x: len(x) > 0, words)) # 将时间均匀分配到每个字上 words_withtime = [] word_unittime = (end_time - start_time) / len(words) for i, word in enumerate(words): word_start = start_time + word_unittime * i word_end = word_start + word_unittime words_withtime.append( { "text": word, "start_time": word_start * 1000, "end_time": word_end * 1000, } ) # 将句子首尾的标点符号时间扩展到字上 标点符号时间为瞬间 punc_at = 0 while punc_at < len(words) and words[punc_at] in puncs: punc_at += 1 if punc_at < len(words): words_withtime[punc_at]["start_time"] = words_withtime[0][ "start_time" ] for i in range(0, punc_at): words_withtime[i]["start_time"] = words_withtime[0]["start_time"] words_withtime[i]["end_time"] = words_withtime[0]["start_time"] punc_at = len(words) - 1 while punc_at >= 0 and words[punc_at] in puncs: punc_at -= 1 if punc_at >= 0: words_withtime[punc_at]["end_time"] = words_withtime[-1]["end_time"] for i in range(punc_at + 1, len(words)): words_withtime[i]["start_time"] = ( words_withtime[-1]["end_time"] + 0.1 ) words_withtime[i]["end_time"] = words_withtime[-1]["end_time"] + 0.1 if random.random() < 0.4 and len(words_withtime) > 1: log.info("发送一次final_result=False") rand_idx = random.randint(1, len(words_withtime) - 1) recognition_result = { "text": " ".join( map(lambda x: x["text"], words_withtime[:rand_idx]) ), "final_result": False, "para_seq": para_seq, "language": "de", "start_time": start_time * 1000, "end_time": end_time * 1000, "words": words_withtime[:rand_idx], } callback(recognition_result) recognition_result = { "text": answer, "final_result": True, "para_seq": para_seq, "language": "de", "start_time": start_time * 1000, "end_time": end_time * 1000, "words": words_withtime, } callback(recognition_result) para_seq += 1 log.info("send %s" % para_seq) time.sleep(send_interval) callback(None) # ignore END if __name__ == "__main__": app.run(host="0.0.0.0", port=80)