#!/usr/bin/env python3 # coding=utf-8 # Copyright 2022 Ruiqi WANG, Jinpeng LI, Jiayu DU # # only tested and validated on pynini v2.1.5 via : 'conda install -c conda-forge pynini' # pynini v2.1.0 doesn't work # import argparse import os import string import sys from nemo_text_processing.text_normalization.normalize import Normalizer def read_interjections(filepath): interjections = [] with open(filepath) as f: for line in f: words = [x.strip() for x in line.split(',')] interjections += [w for w in words] + [w.upper() for w in words] + [w.lower() for w in words] return list(set(interjections)) # deduplicated if __name__ == '__main__': p = argparse.ArgumentParser() p.add_argument('ifile', help='input filename, assume utf-8 encoding') p.add_argument('ofile', help='output filename') p.add_argument('--to_upper', action='store_true', help='convert to upper case') p.add_argument('--to_lower', action='store_true', help='convert to lower case') p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.") p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines') args = p.parse_args() nemo_tn_en = Normalizer(input_case='lower_cased', lang='en') itj = read_interjections(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'interjections_en.csv')) itj_map = {x: True for x in itj} certain_single_quote_items = ["\"'", "'?", "'!", "'.", "?'", "!'", ".'", "''", "'", "'"] single_quote_removed_items = [x.replace("'", '') for x in certain_single_quote_items] puncts_to_remove = string.punctuation.replace("'", '') + "—–“”" puncts_trans = str.maketrans(puncts_to_remove, ' ' * len(puncts_to_remove), '') n = 0 with open(args.ifile, 'r', encoding='utf8') as fi, open(args.ofile, 'w+', encoding='utf8') as fo: for line in fi: if args.has_key: cols = line.strip().split(maxsplit=1) key, text = cols[0].strip(), cols[1].strip() if len(cols) == 2 else '' else: text = line.strip() text = text.replace("‘", "'").replace("’", "'") # nemo text normalization # modifications to NeMo: # 1. added UK to US conversion: nemo_text_processing/text_normalization/en/data/whitelist/UK_to_US.tsv # 2. swith 'oh' to 'o' in year TN to avoid confusion with interjections, e.g.: # 1805: eighteen oh five -> eighteen o five text = nemo_tn_en.normalize(text.lower()) # Punctuations # NOTE(2022.10 Jiayu): # Single quote removal is not perfect. # ' needs to be reserved for: # Abbreviations: # I'm, don't, she'd, 'cause, Sweet Child o' Mine, Guns N' Roses, ... # Possessions: # John's, the king's, parents', ... text = '' + text + '' for x, y in zip(certain_single_quote_items, single_quote_removed_items): text = text.replace(x, y) text = text.replace('', '').replace('', '') text = text.translate(puncts_trans).replace(" ' ", " ") # Interjections text = ' '.join([x for x in text.strip().split() if x not in itj_map]) # Cases if args.to_upper and args.to_lower: sys.stderr.write('text norm: to_upper OR to_lower?') exit(1) if args.to_upper: text = text.upper() if args.to_lower: text = text.lower() if args.has_key: print(key + '\t' + text, file=fo) else: print(text, file=fo) n += 1 if n % args.log_interval == 0: print(f'text norm: {n} lines done.', file=sys.stderr) print(f'text norm: {n} lines done in total.', file=sys.stderr)