"""Multilingual text normalization and tokenization utilities."""
import os
|
|
import subprocess
|
|
from enum import Enum
|
|
from typing import List
|
|
|
|
from utils.logger import logger
|
|
|
|
|
|
class TokenizerType(str, Enum):
    """How a normalized sentence is split into tokens."""

    word = "word"              # one token per character
    whitespace = "whitespace"  # split on whitespace (str.split)


class LangType(str, Enum):
    """Languages that get a dedicated normalization path in Tokenizer.norm."""

    zh = "zh"
    en = "en"


# Language code -> tokenizer kind. zh/ja/ti are tokenized per character;
# all other listed languages are split on whitespace.
TOKENIZER_MAPPING = {
    'zh': TokenizerType.word,
    'en': TokenizerType.whitespace,
    'ru': TokenizerType.whitespace,
    'ar': TokenizerType.whitespace,
    'tr': TokenizerType.whitespace,
    'es': TokenizerType.whitespace,
    'pt': TokenizerType.whitespace,
    'id': TokenizerType.whitespace,
    'he': TokenizerType.whitespace,
    'ja': TokenizerType.word,
    'pl': TokenizerType.whitespace,
    'de': TokenizerType.whitespace,
    'fr': TokenizerType.whitespace,
    'nl': TokenizerType.whitespace,
    'el': TokenizerType.whitespace,
    'vi': TokenizerType.whitespace,
    'th': TokenizerType.whitespace,
    'it': TokenizerType.whitespace,
    'fa': TokenizerType.whitespace,
    'ti': TokenizerType.word,
}
|
|
|
|
|
|
class Tokenizer:
    """Language-aware text normalization and tokenization.

    Normalization is per-language: Chinese runs through the speechio
    ``TextNorm`` pipeline, English shells out to ``textnorm_en.py``, and
    every other language falls back to punctuation stripping plus
    lowercasing. Tokenization is chosen via ``TOKENIZER_MAPPING``
    (per-character for "word" languages, ``str.split`` otherwise).
    """

    @classmethod
    def norm_and_tokenize(cls, sentences: List[str], lang: LangType = None):
        """Normalize ``sentences`` for ``lang``, then tokenize the result."""
        sentences = cls.norm(sentences, lang)
        return cls.tokenize(sentences, lang)

    @classmethod
    def tokenize(cls, sentences: List[str], lang: LangType = None):
        """Split each sentence into tokens per ``TOKENIZER_MAPPING[lang]``.

        Returns a list of token lists, one per input sentence.

        Raises:
            ValueError: if no tokenizer is registered for ``lang``.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang, None)
        if tokenizer == TokenizerType.word:
            # Character-level tokenization (zh / ja / ti).
            return [list(sentence) for sentence in sentences]
        elif tokenizer == TokenizerType.whitespace:
            return [sentence.split() for sentence in sentences]
        else:
            logger.error("找不到对应的分词器")
            # Was `exit(-1)`: terminating the whole interpreter from
            # library code is hostile to callers — raise instead.
            raise ValueError(f"no tokenizer registered for lang={lang!r}")

    @classmethod
    def norm(cls, sentences: List[str], lang: LangType = None):
        """Normalize ``sentences`` according to ``lang`` (see class docstring)."""
        if lang == LangType.zh:
            return cls._norm_zh(sentences)
        elif lang == LangType.en:
            return cls._norm_en(sentences)
        else:
            return cls._norm_default(sentences)

    @classmethod
    def _norm_zh(cls, sentences: List[str]):
        # Chinese normalization via the speechio TextNorm pipeline.
        from utils.speechio import textnorm_zh as textnorm

        normalizer = textnorm.TextNorm(
            to_banjiao=True,
            to_upper=True,
            to_lower=False,
            remove_fillers=True,
            remove_erhua=False,  # unlike batch recognition, erhua is kept here
            check_chars=False,
            remove_space=False,
            cc_mode="",
        )
        return [normalizer(sentence) for sentence in sentences]

    @classmethod
    def _norm_en(cls, sentences: List[str]):
        # English normalization shells out to speechio's textnorm_en.py via
        # two scratch files in the current working directory.
        # NOTE(review): the hard-coded relative paths are race-prone when two
        # processes share a CWD — consider tempfile; kept for compatibility.
        pwd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        with open('./predict.txt', 'w', encoding='utf-8') as fp:
            for idx, sentence in enumerate(sentences):
                fp.write('%s\t%s\n' % (idx, sentence))
        # Was a shell=True command string; an argv list plus an explicit
        # environment is immune to word-splitting/injection when `pwd`
        # contains spaces or shell metacharacters.
        speechio_dir = os.path.join(pwd, 'utils', 'speechio')
        subprocess.run(
            [
                'python',
                os.path.join(speechio_dir, 'textnorm_en.py'),
                '--has_key',
                '--to_upper',
                './predict.txt',
                './predict_norm.txt',
            ],
            env=dict(os.environ, PYTHONPATH=speechio_dir),
            check=True,
        )
        sentence_norm = []
        with open('./predict_norm.txt', 'r', encoding='utf-8') as fp:
            for line in fp:
                line_split_result = line.strip().split('\t', 1)
                if len(line_split_result) >= 2:
                    sentence_norm.append(line_split_result[1])
                else:
                    # Normalization may strip a sentence down to nothing,
                    # leaving only the key on the line.
                    sentence_norm.append("")
        return sentence_norm

    @classmethod
    def _norm_default(cls, sentences: List[str]):
        # Fallback normalization: map every punctuation character (ASCII and
        # fullwidth/CJK) to a space, then lowercase.
        punc = "!?。"#$%&'()*+,-/:;<=>[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏.`! #$%^&*()_+-=|';\":/.,?><~·!#¥%……&*()——+-=“:’;、。,?》《{}"
        # Build the translation table once, not once per sentence.
        table = str.maketrans(dict.fromkeys(punc, " "))
        return [sentence.translate(table).lower() for sentence in sentences]
|