import os
import subprocess
from enum import Enum
from typing import Dict, List, Optional

from utils.logger import logger


class TokenizerType(str, Enum):
    """Strategy used to split a sentence into tokens."""
    word = "word"              # per-character split (for scripts without spaces)
    whitespace = "whitespace"  # split on whitespace runs


class LangType(str, Enum):
    """Languages with a dedicated normalization path."""
    zh = "zh"
    en = "en"


# Language code -> tokenizer strategy.
# NOTE: keys are plain strings; lookups with LangType members also work here
# because each member's name equals its value, so hashing/equality line up.
TOKENIZER_MAPPING: Dict[str, TokenizerType] = {
    'zh': TokenizerType.word,
    'en': TokenizerType.whitespace,
    'ru': TokenizerType.whitespace,
    'ar': TokenizerType.whitespace,
    'tr': TokenizerType.whitespace,
    'es': TokenizerType.whitespace,
    'pt': TokenizerType.whitespace,
    'id': TokenizerType.whitespace,
    'he': TokenizerType.whitespace,
    'ja': TokenizerType.word,
    'pl': TokenizerType.whitespace,
    'de': TokenizerType.whitespace,
    'fr': TokenizerType.whitespace,
    'nl': TokenizerType.whitespace,
    'el': TokenizerType.whitespace,
    'vi': TokenizerType.whitespace,
    'th': TokenizerType.whitespace,
    'it': TokenizerType.whitespace,
    'fa': TokenizerType.whitespace,
    'ti': TokenizerType.word,
}


class Tokenizer:
    """Text normalization and tokenization for evaluation/preprocessing."""

    @classmethod
    def norm_and_tokenize(cls, sentences: List[str], lang: Optional[LangType] = None) -> List[List[str]]:
        """Normalize ``sentences`` for ``lang`` and tokenize the result.

        :param sentences: raw input sentences
        :param lang: language code controlling both normalization and tokenizer
        :return: one token list per input sentence
        """
        return cls.tokenize(cls.norm(sentences, lang), lang)

    @classmethod
    def tokenize(cls, sentences: List[str], lang: Optional[LangType] = None) -> List[List[str]]:
        """Split each sentence into tokens using the tokenizer mapped to ``lang``.

        Exits the process when ``lang`` has no registered tokenizer.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        if tokenizer == TokenizerType.word:
            # Character-level split for languages without whitespace word boundaries.
            return [list(sentence) for sentence in sentences]
        elif tokenizer == TokenizerType.whitespace:
            return [sentence.split() for sentence in sentences]
        else:
            # NOTE(review): terminating the whole process from library code is
            # drastic — kept for backward compatibility; consider raising
            # ValueError instead so callers can recover.
            logger.error("找不到对应的分词器")
            exit(-1)

    @classmethod
    def norm(cls, sentences: List[str], lang: Optional[LangType] = None) -> List[str]:
        """Normalize sentences before tokenization.

        zh: SpeechIO ``TextNorm`` in-process.
        en: external ``textnorm_en.py`` script via temp files in the CWD.
        other: strip punctuation (mapped to spaces) and lowercase.
        """
        if lang == LangType.zh:
            from utils.speechio import textnorm_zh as textnorm
            normalizer = textnorm.TextNorm(
                to_banjiao=True,
                to_upper=True,
                to_lower=False,
                remove_fillers=True,
                remove_erhua=False,  # unlike the batch-recognition setup, keep erhua
                check_chars=False,
                remove_space=False,
                cc_mode="",
            )
            return [normalizer(sentence) for sentence in sentences]
        elif lang == LangType.en:
            # Repo root, so the external normalizer script can be located.
            pwd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            with open('./predict.txt', 'w', encoding='utf-8') as fp:
                for idx, sentence in enumerate(sentences):
                    fp.write('%s\t%s\n' % (idx, sentence))
            subprocess.run(
                f'PYTHONPATH={pwd}/utils/speechio python {pwd}/utils/speechio/textnorm_en.py --has_key --to_upper ./predict.txt ./predict_norm.txt',
                shell=True,
                check=True,
            )
            sentence_norm = []
            with open('./predict_norm.txt', 'r', encoding='utf-8') as fp:
                for line in fp:  # iterate lazily; readlines() materialized the whole file
                    line_split_result = line.strip().split('\t', 1)
                    if len(line_split_result) >= 2:
                        sentence_norm.append(line_split_result[1])
                    else:
                        # Normalization may leave nothing after the key.
                        sentence_norm.append("")
            return sentence_norm
        else:
            punc = "!?。"#$%&'()*+,-/:;<=>[\\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏.`! #$%^&*()_+-=|';\":/.,?><~·!#¥%……&*()——+-=“:’;、。,?》《{}"
            # Build the translation table once; it was previously rebuilt per sentence.
            table = str.maketrans(dict.fromkeys(punc, " "))
            return [sentence.translate(table).lower() for sentence in sentences]