"""Language-aware text normalization and tokenization utilities."""
import os
import subprocess
from enum import Enum
from typing import List

from utils.logger import logger
class TokenizerType(str, Enum):
    """How sentences of a language are split into tokens."""

    # Character-by-character split (used for languages without
    # whitespace-delimited words, e.g. zh/ja).
    word = "word"
    # Lower-case and split on \w+ runs.
    whitespace = "whitespace"


class LangType(str, Enum):
    """Language codes with dedicated normalization pipelines."""

    zh = "zh"  # Chinese
    en = "en"  # English


# Tokenizer selection per ISO language code: character-split ("word") for
# zh/ja/ti, regex word-split ("whitespace") for the rest.
TOKENIZER_MAPPING = {
    'zh': TokenizerType.word,
    'en': TokenizerType.whitespace,
    'ru': TokenizerType.whitespace,
    'ar': TokenizerType.whitespace,
    'tr': TokenizerType.whitespace,
    'es': TokenizerType.whitespace,
    'pt': TokenizerType.whitespace,
    'id': TokenizerType.whitespace,
    'he': TokenizerType.whitespace,
    'ja': TokenizerType.word,
    'pl': TokenizerType.whitespace,
    'de': TokenizerType.whitespace,
    'fr': TokenizerType.whitespace,
    'nl': TokenizerType.whitespace,
    'el': TokenizerType.whitespace,
    'vi': TokenizerType.whitespace,
    'th': TokenizerType.whitespace,
    'it': TokenizerType.whitespace,
    'fa': TokenizerType.whitespace,
    'ti': TokenizerType.word,
}


# English-normalization dependencies: NLTK word tokenizer and WordNet
# lemmatizer (used by Tokenizer.norm for lang == "en").
import nltk

import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Single module-level lemmatizer instance, shared across all norm() calls.
lemmatizer = WordNetLemmatizer()


class Tokenizer:
    """Language-aware sentence normalization and tokenization.

    Stateless namespace of classmethods. Tokenizer choice per language is
    driven by the module-level TOKENIZER_MAPPING.
    """

    @classmethod
    def norm_and_tokenize(cls, sentences: List[str], lang: str = None):
        """Strip punctuation, normalize, then tokenize sentences.

        Args:
            sentences: raw sentences to process.
            lang: language code used to pick punctuation handling,
                normalization and tokenizer.

        Returns:
            A list of token lists, one per input sentence.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        sentences = cls.replace_general_punc(sentences, tokenizer)
        sentences = cls.norm(sentences, lang)
        return cls.tokenize(sentences, lang)

    @classmethod
    def tokenize(cls, sentences: List[str], lang: str = None):
        """Split each sentence into tokens according to the language.

        word-type languages are split into individual characters;
        whitespace-type languages are lower-cased and split on \\w+ runs.

        Raises:
            ValueError: if no tokenizer is registered for `lang`.
        """
        tokenizer = TOKENIZER_MAPPING.get(lang)
        if tokenizer == TokenizerType.word:
            # NOTE(review): the character split is not lower-cased, unlike
            # the whitespace branch — presumably intended for CJK input.
            return [list(sentence) for sentence in sentences]
        elif tokenizer == TokenizerType.whitespace:
            return [re.findall(r"\w+", sentence.lower()) for sentence in sentences]
        else:
            # Previously this called exit(-1), killing the whole process
            # from library code; raise instead so callers can handle it.
            logger.error("找不到对应的分词器")
            raise ValueError(f"no tokenizer registered for language: {lang!r}")

    @classmethod
    def norm(cls, sentences: List[str], lang: str = None):
        """Normalize sentences for the given language.

        zh: SpeechIO TextNorm (half-width, upper-case, fillers removed).
        en: keep letters/whitespace only, NLTK tokenize + WordNet lemmatize.
        other/None: strip a broad punctuation set and lower-case.
        """
        if lang == "zh":
            from utils.speechio import textnorm_zh as textnorm

            normalizer = textnorm.TextNorm(
                to_banjiao=True,
                to_upper=True,
                to_lower=False,
                remove_fillers=True,
                remove_erhua=False,  # unlike the batch-recognition config, kept False here
                check_chars=False,
                remove_space=False,
                cc_mode="",
            )
            return [normalizer(sentence) for sentence in sentences]
        elif lang == "en":
            result = []
            for sentence in sentences:
                # Keep letters and whitespace only, then lemmatize each token.
                sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)
                tokens = word_tokenize(sentence)
                tokens = [lemmatizer.lemmatize(t) for t in tokens]
                result.append(" ".join(tokens))
            return result
        else:
            # Fallback for every other language (and lang=None): replace a
            # broad set of CJK + Latin punctuation with spaces, lower-case.
            punc = "!?。"#$%&'()*+,-/:;<=>[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏.`! #$%^&*()_+-=|';\":/.,?><~·!#¥%……&*()——+-=“:’;、。,?》《{}"
            return [sentence.translate(str.maketrans(dict.fromkeys(punc, " "))).lower() for sentence in sentences]

    @classmethod
    def replace_general_punc(cls, sentences: List[str], tokenizer: TokenizerType, language: str = None) -> List[str]:
        """Replace common sentence punctuation, then strip and lower-case.

        Replaces the former utils.metrics.cut_sentence. If `language` is
        given it overrides `tokenizer` via TOKENIZER_MAPPING. Whitespace
        tokenizers substitute a space (so words stay separated); character
        tokenizers delete the punctuation outright.
        """
        if language:
            tokenizer = TOKENIZER_MAPPING.get(language)
        general_puncs = [
            "······",
            "......",
            "。",
            ",",
            "?",
            "!",
            ";",
            ":",
            "...",
            ".",
            ",",
            "?",
            "!",
            ";",
            ":",
        ]
        replacer = " " if tokenizer == TokenizerType.whitespace else ""
        # Translation table over the individual characters (duplicates in
        # general_puncs collapse via dict.fromkeys).
        trans = str.maketrans(dict.fromkeys("".join(general_puncs), replacer))
        return [sentence.translate(trans).strip().lower() for sentence in sentences]