"""Language-aware text normalization and tokenization utilities."""
import os
import subprocess
from enum import Enum
from typing import List

from utils.logger import logger
class TokenizerType(str, Enum):
    """How sentences of a language are split into tokens."""

    # Character-by-character split (used for languages without
    # whitespace-delimited words, e.g. zh/ja).
    word = "word"
    # Lower-case and split on \w+ runs.
    whitespace = "whitespace"


class LangType(str, Enum):
    """Language codes with dedicated normalization pipelines."""

    zh = "zh"  # Chinese
    en = "en"  # English


# Tokenizer selection per ISO language code: character-split ("word") for
# zh/ja/ti, regex word-split ("whitespace") for the rest.
TOKENIZER_MAPPING = {
    'zh': TokenizerType.word,
    'en': TokenizerType.whitespace,
    'ru': TokenizerType.whitespace,
    'ar': TokenizerType.whitespace,
    'tr': TokenizerType.whitespace,
    'es': TokenizerType.whitespace,
    'pt': TokenizerType.whitespace,
    'id': TokenizerType.whitespace,
    'he': TokenizerType.whitespace,
    'ja': TokenizerType.word,
    'pl': TokenizerType.whitespace,
    'de': TokenizerType.whitespace,
    'fr': TokenizerType.whitespace,
    'nl': TokenizerType.whitespace,
    'el': TokenizerType.whitespace,
    'vi': TokenizerType.whitespace,
    'th': TokenizerType.whitespace,
    'it': TokenizerType.whitespace,
    'fa': TokenizerType.whitespace,
    'ti': TokenizerType.word,
}


# English-normalization dependencies: NLTK word tokenizer and WordNet
# lemmatizer (used by Tokenizer.norm for lang == "en").
import nltk

import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Single module-level lemmatizer instance, shared across all norm() calls.
lemmatizer = WordNetLemmatizer()


class Tokenizer:
    """Language-aware sentence normalization and tokenization.

    Stateless namespace of classmethods. Tokenizer choice per language is
    driven by the module-level TOKENIZER_MAPPING.
    """

    @classmethod
    def norm_and_tokenize(cls, sentences: List[str], lang: str = None):
        """Strip punctuation, normalize, then tokenize sentences.

        Args:
            sentences: raw sentences to process.
            lang: language code used to pick punctuation handling,
                normalization and tokenizer.

        Returns:
            A list of token lists, one per input sentence.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        sentences = cls.replace_general_punc(sentences, tokenizer)
        sentences = cls.norm(sentences, lang)
        return cls.tokenize(sentences, lang)

    @classmethod
    def tokenize(cls, sentences: List[str], lang: str = None):
        """Split each sentence into tokens according to the language.

        word-type languages are split into individual characters;
        whitespace-type languages are lower-cased and split on \\w+ runs.

        Raises:
            ValueError: if no tokenizer is registered for `lang`.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        if tokenizer == TokenizerType.word:
            # NOTE(review): the character split is not lower-cased, unlike
            # the whitespace branch — presumably intended for CJK input.
            return [list(sentence) for sentence in sentences]
        elif tokenizer == TokenizerType.whitespace:
            return [re.findall(r"\w+", sentence.lower()) for sentence in sentences]
        else:
            # Previously this called exit(-1), killing the whole process
            # from library code; raise instead so callers can handle it.
            logger.error("找不到对应的分词器")
            raise ValueError(f"no tokenizer registered for language: {lang!r}")

    @classmethod
    def norm(cls, sentences: List[str], lang: str = None):
        """Normalize sentences for the given language.

        zh: SpeechIO TextNorm (half-width, upper-case, fillers removed).
        en: keep letters/whitespace only, NLTK tokenize + WordNet lemmatize.
        other/None: strip a broad punctuation set and lower-case.
        """
        if lang == "zh":
            from utils.speechio import textnorm_zh as textnorm

            normalizer = textnorm.TextNorm(
                to_banjiao=True,
                to_upper=True,
                to_lower=False,
                remove_fillers=True,
                remove_erhua=False,  # unlike the batch-recognition config, kept False here
                check_chars=False,
                remove_space=False,
                cc_mode="",
            )
            return [normalizer(sentence) for sentence in sentences]
        elif lang == "en":
            result = []
            for sentence in sentences:
                # Keep letters and whitespace only, then lemmatize each token.
                sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)
                tokens = word_tokenize(sentence)
                tokens = [lemmatizer.lemmatize(t) for t in tokens]
                result.append(" ".join(tokens))
            return result
        else:
            # Fallback for every other language (and lang=None): replace a
            # broad set of CJK + Latin punctuation with spaces, lower-case.
            punc = "!?。"#$%&'()*+,-/:;<=>[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏.`! #$%^&*()_+-=|';\":/.,?><~·!#¥%……&*()——+-=“:’;、。,?》《{}"
            return [sentence.translate(str.maketrans(dict.fromkeys(punc, " "))).lower() for sentence in sentences]

    @classmethod
    def replace_general_punc(cls, sentences: List[str], tokenizer: TokenizerType, language: str = None) -> List[str]:
        """Replace common sentence punctuation, then strip and lower-case.

        Replaces the former utils.metrics.cut_sentence. If `language` is
        given it overrides `tokenizer` via TOKENIZER_MAPPING. Whitespace
        tokenizers substitute a space (so words stay separated); character
        tokenizers delete the punctuation outright.
        """
        if language:
            tokenizer = TOKENIZER_MAPPING.get(language)
        general_puncs = [
            "······",
            "......",
            "。",
            ",",
            "?",
            "!",
            ";",
            ":",
            "...",
            ".",
            ",",
            "?",
            "!",
            ";",
            ":",
        ]
        replacer = " " if tokenizer == TokenizerType.whitespace else ""
        # Translation table over the individual characters (duplicates in
        # general_puncs collapse via dict.fromkeys).
        trans = str.maketrans(dict.fromkeys("".join(general_puncs), replacer))
        return [sentence.translate(trans).strip().lower() for sentence in sentences]