Files
enginex-c_series-asr/utils/speechio/textnorm_en.py

101 lines
4.0 KiB
Python
Raw Permalink Normal View History

2025-08-28 18:46:56 +08:00
#!/usr/bin/env python3
# coding=utf-8
# Copyright 2022 Ruiqi WANG, Jinpeng LI, Jiayu DU
#
# only tested and validated on pynini v2.1.5 via : 'conda install -c conda-forge pynini'
# pynini v2.1.0 doesn't work
#
import argparse
import os
import string
import sys
from nemo_text_processing.text_normalization.normalize import Normalizer
def read_interjections(filepath):
    """Load the interjection word list from a comma-separated text file.

    Every line is split on commas and each field is whitespace-stripped.
    Each non-empty field contributes three spellings to the result: as
    written, upper-cased and lower-cased.

    Args:
        filepath: Path to the interjection file; it is decoded as UTF-8.

    Returns:
        A list of unique interjection strings, in no particular order.
    """
    interjections = set()
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(filepath, encoding='utf8') as f:
        for line in f:
            for word in line.split(','):
                word = word.strip()
                if not word:
                    # Blank lines and trailing commas produce empty fields;
                    # an empty string is not a meaningful interjection.
                    continue
                interjections.update((word, word.upper(), word.lower()))
    return list(interjections)
if __name__ == '__main__':
    # Command-line entry point: read `ifile` line by line, normalize each
    # line (NeMo text normalization, punctuation removal, interjection
    # filtering, optional case conversion) and write the result to `ofile`.
    p = argparse.ArgumentParser()
    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
    p.add_argument('ofile', help='output filename')
    p.add_argument('--to_upper', action='store_true', help='convert to upper case')
    p.add_argument('--to_lower', action='store_true', help='convert to lower case')
    p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
    p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines')
    args = p.parse_args()

    # Reject contradictory flags up front, before the normalizer is built
    # and before the output file is opened (and thereby truncated).
    if args.to_upper and args.to_lower:
        sys.stderr.write('text norm: to_upper OR to_lower?\n')
        sys.exit(1)

    nemo_tn_en = Normalizer(input_case='lower_cased', lang='en')

    # The interjection list lives next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    interjections = set(read_interjections(os.path.join(script_dir, 'interjections_en.csv')))

    # Contexts in which a single quote is a quotation mark rather than an
    # apostrophe, paired with the same context with the quote removed.
    certain_single_quote_items = ["\"'", "'?", "'!", "'.", "?'", "!'", ".'", "''", "<BOS>'", "'<EOS>"]
    single_quote_removed_items = [x.replace("'", '') for x in certain_single_quote_items]

    # Every punctuation mark except the apostrophe is mapped to a space.
    puncts_to_remove = string.punctuation.replace("'", '') + "—–“”"
    puncts_trans = str.maketrans(puncts_to_remove, ' ' * len(puncts_to_remove))

    n = 0
    with open(args.ifile, 'r', encoding='utf8') as fi, open(args.ofile, 'w', encoding='utf8') as fo:
        for lineno, line in enumerate(fi, 1):
            if args.has_key:
                cols = line.strip().split(maxsplit=1)
                if not cols:
                    # A blank line carries no key, so there is nothing to emit.
                    print(f'text norm: line {lineno} is empty (no key), skipped.', file=sys.stderr)
                    continue
                key = cols[0]
                text = cols[1].strip() if len(cols) == 2 else ''
            else:
                text = line.strip()

            # Normalize typographic single quotes to the ASCII apostrophe.
            # Written as escapes so the characters cannot be lost in transit;
            # an empty pattern would make str.replace insert an apostrophe
            # between every character.
            text = text.replace('\u2018', "'").replace('\u2019', "'")

            # nemo text normalization
            # modifications to NeMo:
            #     1. added UK to US conversion: nemo_text_processing/text_normalization/en/data/whitelist/UK_to_US.tsv
            #     2. switch 'oh' to 'o' in year TN to avoid confusion with interjections, e.g.:
            #         1805: eighteen oh five -> eighteen o five
            text = nemo_tn_en.normalize(text.lower())

            # Punctuations
            # NOTE(2022.10 Jiayu):
            # Single quote removal is not perfect.
            # ' needs to be reserved for:
            #     Abbreviations:
            #         I'm, don't, she'd, 'cause, Sweet Child o' Mine, Guns N' Roses, ...
            #     Possessions:
            #         John's, the king's, parents', ...
            # Sentinels let the replacement table also match a quote at the
            # very start or end of the text.
            text = '<BOS>' + text + '<EOS>'
            for x, y in zip(certain_single_quote_items, single_quote_removed_items):
                text = text.replace(x, y)
            text = text.replace('<BOS>', '').replace('<EOS>', '')
            text = text.translate(puncts_trans).replace(" ' ", " ")

            # Interjections
            text = ' '.join(x for x in text.split() if x not in interjections)

            # Cases (the flags were validated to be mutually exclusive above)
            if args.to_upper:
                text = text.upper()
            elif args.to_lower:
                text = text.lower()

            if args.has_key:
                print(key + '\t' + text, file=fo)
            else:
                print(text, file=fo)

            n += 1
            if n % args.log_interval == 0:
                print(f'text norm: {n} lines done.', file=sys.stderr)
    print(f'text norm: {n} lines done in total.', file=sys.stderr)