# Source: enginex-bi_series-vc-cnn/utils/tokenizer.py
# Last commit: zhousha 55a67e817e "update", 2025-08-06 15:38:55 +08:00
# 98 lines, 4.0 KiB, Python
import os
import subprocess
from enum import Enum
from typing import List
from utils.logger import logger
class TokenizerType(str, Enum):
    """How a sentence is split into tokens.

    Inherits from ``str`` so members compare equal to their plain
    string values.
    """

    word = "word"              # character-by-character split
    whitespace = "whitespace"  # split on runs of whitespace
class LangType(str, Enum):
    """Languages with a dedicated normalization path.

    Inherits from ``str`` so members compare equal to their plain
    string values.
    """

    zh = "zh"  # Chinese
    en = "en"  # English
# Per-language tokenizer registry, keyed by ISO 639-1 language code.
# ``word`` tokenizers split into individual characters; ``whitespace``
# tokenizers split on runs of whitespace (see Tokenizer.tokenize).
# Insertion order matches the original item-by-item assignments.
TOKENIZER_MAPPING = {
    'zh': TokenizerType.word,
    'en': TokenizerType.whitespace,
    'ru': TokenizerType.whitespace,
    'ar': TokenizerType.whitespace,
    'tr': TokenizerType.whitespace,
    'es': TokenizerType.whitespace,
    'pt': TokenizerType.whitespace,
    'id': TokenizerType.whitespace,
    'he': TokenizerType.whitespace,
    'ja': TokenizerType.word,
    'pl': TokenizerType.whitespace,
    'de': TokenizerType.whitespace,
    'fr': TokenizerType.whitespace,
    'nl': TokenizerType.whitespace,
    'el': TokenizerType.whitespace,
    'vi': TokenizerType.whitespace,
    'th': TokenizerType.whitespace,
    'it': TokenizerType.whitespace,
    'fa': TokenizerType.whitespace,
    'ti': TokenizerType.word,
}
class Tokenizer:
    """Language-aware text normalization and tokenization.

    All methods are classmethods operating on lists of sentences; the
    tokenizer per language is looked up in the module-level
    ``TOKENIZER_MAPPING``.
    """

    @classmethod
    def norm_and_tokenize(cls, sentences: List[str], lang: LangType = None):
        """Normalize ``sentences`` for ``lang``, then tokenize the result."""
        sentences = cls.norm(sentences, lang)
        return cls.tokenize(sentences, lang)

    @classmethod
    def tokenize(cls, sentences: List[str], lang: LangType = None):
        """Split each sentence into tokens using the tokenizer registered
        for ``lang``.

        Returns a list of token lists, one per input sentence. Exits the
        process when ``lang`` has no registered tokenizer.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        if tokenizer == TokenizerType.word:
            # Character-level split for scripts written without spaces.
            return [list(sentence) for sentence in sentences]
        elif tokenizer == TokenizerType.whitespace:
            return [sentence.split() for sentence in sentences]
        else:
            # NOTE(review): exiting from library code is abrupt; raising
            # ValueError would let callers recover — kept as-is to avoid
            # changing the exception type callers may rely on.
            logger.error("找不到对应的分词器")  # "no matching tokenizer"
            exit(-1)

    @classmethod
    def norm(cls, sentences: List[str], lang: LangType = None):
        """Text-normalize ``sentences`` for ``lang``.

        zh:     speechio Chinese normalizer, run in-process.
        en:     speechio English normalizer, run as a subprocess that
                communicates through ./predict.txt / ./predict_norm.txt
                in the current working directory.
        other:  replace punctuation with spaces and lowercase.
        """
        if lang == LangType.zh:
            from utils.speechio import textnorm_zh as textnorm
            normalizer = textnorm.TextNorm(
                to_banjiao=True,
                to_upper=True,
                to_lower=False,
                remove_fillers=True,
                remove_erhua=False,  # unlike batch recognition, keep erhua
                check_chars=False,
                remove_space=False,
                cc_mode="",
            )
            return [normalizer(sentence) for sentence in sentences]
        elif lang == LangType.en:
            # Repository root (two levels above this file).
            pwd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # Write "<index>\t<sentence>" lines for the external normalizer;
            # --has_key makes it echo the index back on each output line.
            with open('./predict.txt', 'w', encoding='utf-8') as fp:
                for idx, sentence in enumerate(sentences):
                    fp.write('%s\t%s\n' % (idx, sentence))
            # NOTE(review): shell=True with an interpolated path. ``pwd``
            # is derived from __file__ so injection risk is low, but an
            # argv list plus an env= mapping would be safer.
            subprocess.run(
                f'PYTHONPATH={pwd}/utils/speechio python {pwd}/utils/speechio/textnorm_en.py --has_key --to_upper ./predict.txt ./predict_norm.txt',
                shell=True,
                check=True,
            )
            # The normalizer may drop lines entirely, so align results by
            # the numeric key instead of by line position (a positional
            # append would mis-align every sentence after a dropped line).
            norm_by_key = {}
            with open('./predict_norm.txt', 'r', encoding='utf-8') as fp:
                for line in fp:
                    parts = line.strip().split('\t', 1)
                    if parts and parts[0]:
                        # Key present but no text -> normalized to empty.
                        norm_by_key[parts[0]] = parts[1] if len(parts) >= 2 else ""
            # One entry per input sentence; missing keys become "".
            return [norm_by_key.get(str(i), "") for i in range(len(sentences))]
        else:
            # Fallback: map a broad set of ASCII + fullwidth punctuation to
            # spaces in one C-level pass, then lowercase.
            punc = "!?。"#$%&'()*+,-/:;<=>[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏.`! #$%^&*()_+-=|';\":/.,?><~·!#¥%……&*()——+-=“:’;、。,?》《{}"
            return [sentence.translate(str.maketrans(dict.fromkeys(punc, " "))).lower() for sentence in sentences]