This commit is contained in:
zhousha
2025-08-06 15:38:55 +08:00
parent 4916ad0fe0
commit 55a67e817e
193 changed files with 51647 additions and 1 deletions

View File

@@ -0,0 +1,3 @@
'''
reference: https://github.com/SpeechColab/Leaderboard/tree/f287a992dc359d1c021bfc6ce810e5e36608e057/utils
'''

View File

@@ -0,0 +1,551 @@
#!/usr/bin/env python3
# coding=utf8
# Copyright 2022 Zhenxiang MA, Jiayu DU (SpeechColab)
import argparse
import csv
import json
import logging
import os
import sys
from typing import Iterable
logging.basicConfig(stream=sys.stderr, level=logging.ERROR, format='[%(levelname)s] %(message)s')
import pynini
from pynini.lib import pynutil
# reference: https://github.com/kylebgorman/pynini/blob/master/pynini/lib/edit_transducer.py
# to import original lib:
# from pynini.lib.edit_transducer import EditTransducer
class EditTransducer:
    """Levenshtein edit transducer over a token vocabulary (pynini-based).

    Adapted from pynini.lib.edit_transducer.EditTransducer, extended with an
    auxiliary right factor that matches a token's raw form against its
    '#'-suffixed form at zero cost (used for GLM rule expansion).
    """

    DELETE = "<delete>"
    INSERT = "<insert>"
    SUBSTITUTE = "<substitute>"

    def __init__(
        self,
        symbol_table,
        vocab: Iterable[str],
        insert_cost: float = 1.0,
        delete_cost: float = 1.0,
        substitute_cost: float = 1.0,
        bound: int = 0,
    ):
        """Builds the left (input-side) and right (output-side) edit factors.

        Args:
            symbol_table: pynini SymbolTable covering vocab and its '#' forms.
            vocab: iterable of vocabulary tokens.
            insert_cost/delete_cost/substitute_cost: per-edit costs.
            bound: if non-zero, maximum number of edits allowed.
        """
        # Left factor; note that we divide the edit costs by two because they also
        # will be incurred when traversing the right factor.
        sigma = pynini.union(
            *[pynini.accep(token, token_type=symbol_table) for token in vocab],
        ).optimize()
        insert = pynutil.insert(f"[{self.INSERT}]", weight=insert_cost / 2)
        delete = pynini.cross(sigma, pynini.accep(f"[{self.DELETE}]", weight=delete_cost / 2))
        substitute = pynini.cross(sigma, pynini.accep(f"[{self.SUBSTITUTE}]", weight=substitute_cost / 2))
        edit = pynini.union(insert, delete, substitute).optimize()
        if bound:
            # Bounded edits: at most `bound` optional edits interleaved with token runs.
            sigma_star = pynini.closure(sigma)
            self._e_i = sigma_star.copy()
            for _ in range(bound):
                self._e_i.concat(edit.ques).concat(sigma_star)
        else:
            self._e_i = edit.union(sigma).closure()
        self._e_i.optimize()
        right_factor_std = EditTransducer._right_factor(self._e_i)
        # right_factor_ext allows 0-cost matching between token's raw form & auxiliary form
        # e.g.: 'I' -> 'I#', 'AM' -> 'AM#'
        right_factor_ext = (
            pynini.union(
                *[
                    pynini.cross(
                        pynini.accep(x, token_type=symbol_table),
                        pynini.accep(x + '#', token_type=symbol_table),
                    )
                    for x in vocab
                ]
            )
            .optimize()
            .closure()
        )
        self._e_o = pynini.union(right_factor_std, right_factor_ext).closure().optimize()

    @staticmethod
    def _right_factor(ifst: pynini.Fst) -> pynini.Fst:
        """Builds the right factor by inverting and swapping insert/delete labels."""
        ofst = pynini.invert(ifst)
        syms = pynini.generated_symbols()
        insert_label = syms.find(EditTransducer.INSERT)
        delete_label = syms.find(EditTransducer.DELETE)
        pairs = [(insert_label, delete_label), (delete_label, insert_label)]
        return ofst.relabel_pairs(ipairs=pairs)

    def create_lattice(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.Fst:
        """Composes iexpr/oexpr with the two edit factors and validates the result."""
        lattice = (iexpr @ self._e_i) @ (self._e_o @ oexpr)
        EditTransducer.check_wellformed_lattice(lattice)
        return lattice

    @staticmethod
    def check_wellformed_lattice(lattice: pynini.Fst) -> None:
        """Raises RuntimeError when the composition lattice is empty."""
        if lattice.start() == pynini.NO_STATE_ID:
            raise RuntimeError("Edit distance composition lattice is empty.")

    def compute_distance(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> float:
        """Returns the edit distance between iexpr and oexpr."""
        lattice = self.create_lattice(iexpr, oexpr)
        # The shortest cost from all final states to the start state is
        # equivalent to the cost of the shortest path.
        start = lattice.start()
        return float(pynini.shortestdistance(lattice, reverse=True)[start])

    def compute_alignment(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.FstLike:
        """Returns the single best alignment FST between iexpr and oexpr."""
        # FIX: removed leftover debug prints of iexpr/oexpr that dumped raw
        # FSTs to stdout on every utterance.
        lattice = self.create_lattice(iexpr, oexpr)
        alignment = pynini.shortestpath(lattice, nshortest=1, unique=True)
        return alignment.optimize()
class ErrorStats:
    """Accumulates corpus-level token- and sentence-error statistics."""

    def __init__(self):
        # utterance bookkeeping (attribute order matters: to_json dumps __dict__)
        self.num_ref_utts = 0
        self.num_hyp_utts = 0
        self.num_eval_utts = 0  # in both ref & hyp
        self.num_hyp_without_ref = 0
        # edit counts: Correct / Substitution / Insertion / Deletion
        self.C = 0
        self.S = 0
        self.I = 0
        self.D = 0
        # derived rates, in percent
        self.token_error_rate = 0.0
        self.modified_token_error_rate = 0.0
        self.num_utts_with_error = 0
        self.sentence_error_rate = 0.0

    def to_json(self):
        """Serializes every field as a single-line JSON object."""
        return json.dumps(self.__dict__)

    def to_kaldi(self):
        """Formats %WER / %SER lines in Kaldi's report style."""
        n_edit = self.S + self.D + self.I
        n_ref = self.C + self.S + self.D
        wer_line = f'%WER {self.token_error_rate:.2f} [ {n_edit} / {n_ref}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
        ser_line = f'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
        return wer_line + ser_line

    def to_summary(self):
        """Renders the human-readable overall statistics banner."""
        rows = [
            '==================== Overall Statistics ====================',
            f'num_ref_utts: {self.num_ref_utts}',
            f'num_hyp_utts: {self.num_hyp_utts}',
            f'num_hyp_without_ref: {self.num_hyp_without_ref}',
            f'num_eval_utts: {self.num_eval_utts}',
            f'sentence_error_rate: {self.sentence_error_rate:.2f}%',
            f'token_error_rate: {self.token_error_rate:.2f}%',
            f'modified_token_error_rate: {self.modified_token_error_rate:.2f}%',
            'token_stats:',
            f' - tokens:{self.C + self.S + self.D:>7}',
            f' - edits: {self.S + self.I + self.D:>7}',
            f' - cor: {self.C:>7}',
            f' - sub: {self.S:>7}',
            f' - ins: {self.I:>7}',
            f' - del: {self.D:>7}',
            '============================================================',
        ]
        return '\n'.join(rows) + '\n'
class Utterance:
    """A single utterance: an id paired with its transcript text."""

    def __init__(self, uid, text):
        self.uid, self.text = uid, text
def LoadKaldiArc(filepath):
    """Loads a Kaldi-style 'arc' text file: one '<uid> <text...>' per line.

    Args:
        filepath: path to the utf8 text file.

    Returns:
        dict mapping uid -> Utterance; text is '' for uid-only lines.

    Raises:
        RuntimeError: on a duplicated utterance id.
    """
    utts = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            # split(maxsplit=1) always yields 1 or 2 columns for a non-empty
            # line, so the original `assert len(cols) ...` was redundant
            cols = line.split(maxsplit=1)
            uid = cols[0]
            text = cols[1] if len(cols) == 2 else ''
            # membership test instead of `utts.get(uid) != None`
            if uid in utts:
                raise RuntimeError(F'Found duplicated utterence id {uid}')
            utts[uid] = Utterance(uid, text)
    return utts
def BreakHyphen(token: str):
    """Expands a hyphenated token into its parts plus the joined form.

    'T-SHIRT' yields ['T', 'SHIRT', 'TSHIRT'] so both spellings enter the
    vocabulary.
    """
    assert '-' in token
    parts = token.split('-')
    return parts + [token.replace('-', '')]
def LoadGLM(rel_path):
    '''
    Loads a GLM (global mapping) csv, resolved relative to this script's directory.

    glm.csv:
        I'VE,I HAVE
        GOING TO,GONNA
        ...
        T-SHIRT,T SHIRT,TSHIRT
    glm:
        {
            '<RULE_00000>': ["I'VE", 'I HAVE'],
            '<RULE_00001>': ['GOING TO', 'GONNA'],
            ...
            '<RULE_99999>': ['T-SHIRT', 'T SHIRT', 'TSHIRT'],
        }
    '''
    logging.info(f'Loading GLM from {rel_path} ...')
    abs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    glm = {}
    # FIX: the original leaked the file handle (open() without close);
    # `with` guarantees it is released.
    with open(abs_path, encoding="utf-8") as f:
        for k, rule in enumerate(csv.reader(f, delimiter=',')):
            rule_name = f'<RULE_{k:06d}>'
            glm[rule_name] = [phrase.strip() for phrase in rule]
    logging.info(f' #rule: {len(glm)}')
    return glm
def SymbolEQ(symbol_table, i1, i2):
    """True when two symbol ids denote the same token, ignoring the auxiliary '#' suffix."""
    sym1 = symbol_table.find(i1)
    sym2 = symbol_table.find(i2)
    return sym1.strip('#') == sym2.strip('#')
def PrintSymbolTable(symbol_table: pynini.SymbolTable):
    """Dumps every (id, symbol) pair of the table to stdout, for debugging."""
    print('SYMBOL_TABLE:')
    for idx in range(symbol_table.num_symbols()):
        symbol = symbol_table.find(idx)
        # find() is bi-directional (id <-> sym), so the round trip must hold
        assert symbol_table.find(symbol) == idx
        print(idx, symbol)
    print()
def BuildSymbolTable(vocab) -> pynini.SymbolTable:
    """Creates a pynini symbol table: '<epsilon>' at id 0, then every vocab entry."""
    logging.info('Building symbol table ...')
    symbol_table = pynini.SymbolTable()
    for sym in ['<epsilon>', *vocab]:
        symbol_table.add_symbol(sym)
    logging.info(f' #symbols: {symbol_table.num_symbols()}')
    return symbol_table
def BuildGLMTagger(glm, symbol_table) -> pynini.Fst:
    """Builds a rewrite FST that brackets every GLM phrase occurrence with its rule tag.

    For each rule tag and each of its phrases, a tagger inserts the tag before
    and after the matched phrase, e.g. "HEY I'M HERE" ->
    "HEY <RULE_000001> I'M <RULE_000001> HERE".

    Args:
        glm: dict of rule_tag -> list of equivalent phrases (see LoadGLM).
        symbol_table: table covering the vocab, its '#' forms and all rule tags.

    Returns:
        An optimized context-dependent rewrite FST over the full alphabet.
    """
    logging.info('Building GLM tagger ...')
    rule_taggers = []
    for rule_tag, rule in glm.items():
        for phrase in rule:
            # pynutil.insert emits the tag at zero cost around the accepted phrase
            rule_taggers.append(
                (
                    pynutil.insert(pynini.accep(rule_tag, token_type=symbol_table))
                    + pynini.accep(phrase, token_type=symbol_table)
                    + pynutil.insert(pynini.accep(rule_tag, token_type=symbol_table))
                )
            )
    # alphabet = every non-epsilon symbol (id 0 is '<epsilon>')
    alphabet = pynini.union(
        *[pynini.accep(sym, token_type=symbol_table) for k, sym in symbol_table if k != 0]  # non-epsilon
    ).optimize()
    # rewrite anywhere (empty left/right contexts) within closure of the alphabet
    tagger = pynini.cdrewrite(
        pynini.union(*rule_taggers).optimize(), '', '', alphabet.closure()
    ).optimize()  # could be slow with large vocabulary
    return tagger
def TokenWidth(token: str):
    """Display width of a token: CJK chars count as 2 columns, others as 1."""
    width = 0
    for ch in token:
        width += 2 if '\u4e00' <= ch <= '\u9fa5' else 1
    return width
def PrintPrettyAlignment(raw_hyp, edit_ali, ref_ali, hyp_ali, stream=sys.stderr):
    """Prints a column-aligned HYP#/REF/EDIT view of one utterance alignment."""
    assert len(edit_ali) == len(ref_ali) and len(ref_ali) == len(hyp_ali)
    hyp_row = ' HYP# : '
    ref_row = ' REF : '
    edit_row = ' EDIT : '
    for hyp_tok, ref_tok, edit_tag in zip(hyp_ali, ref_ali, edit_ali):
        if edit_tag == 'C':
            edit_tag = ''  # don't bother printing correct edit-tag
        wr, wh, we = TokenWidth(ref_tok), TokenWidth(hyp_tok), TokenWidth(edit_tag)
        col = max(wr, wh, we) + 1
        hyp_row += hyp_tok + ' ' * (col - wh)
        ref_row += ref_tok + ' ' * (col - wr)
        edit_row += edit_tag + ' ' * (col - we)
    print(F' HYP : {raw_hyp}', file=stream)
    print(hyp_row, file=stream)
    print(ref_row, file=stream)
    print(edit_row, file=stream)
def ComputeTokenErrorRate(c, s, i, d):
    """Computes (TER, modified TER) percentages from edit counts.

    TER  = edits / ref_len
    mTER = edits / max(ref_len, hyp_len), which never exceeds 100%

    Args:
        c, s, i, d: correct / substitution / insertion / deletion counts.

    Returns:
        (token_error_rate, modified_token_error_rate) as percentages.

    Raises:
        ValueError: if the reference length (c + s + d) is zero.
    """
    ref_len = c + s + d
    if ref_len == 0:
        # explicit error instead of the original bare `assert`,
        # which is silently stripped under `python -O`
        raise ValueError('reference length (c + s + d) must be non-zero')
    num_edits = s + d + i
    hyp_len = c + s + i
    return 100.0 * num_edits / ref_len, 100.0 * num_edits / max(ref_len, hyp_len)
def ComputeSentenceErrorRate(num_err_utts, num_utts):
    """Percentage of evaluated utterances that contain at least one edit.

    Raises:
        ValueError: if num_utts is zero.
    """
    if num_utts == 0:
        # explicit error instead of `assert` (stripped under -O)
        raise ValueError('num_utts must be non-zero')
    return 100.0 * num_err_utts / num_utts
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--logk', type=int, default=500, help='logging interval')
    parser.add_argument(
        '--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER'
    )
    parser.add_argument('--glm', type=str, default='glm_en.csv', help='glm')
    parser.add_argument('--ref', type=str, required=True, help='reference kaldi arc file')
    parser.add_argument('--hyp', type=str, required=True, help='hypothesis kaldi arc file')
    parser.add_argument('result_file', type=str)
    args = parser.parse_args()
    logging.info(args)

    stats = ErrorStats()

    # 1. pick the tokenizer (word-level -> WER, char-level -> CER)
    logging.info('Generating tokenizer ...')
    if args.tokenizer == 'whitespace':
        def word_tokenizer(text):
            return text.strip().split()
        tokenizer = word_tokenizer
    elif args.tokenizer == 'char':
        def char_tokenizer(text):
            return [c for c in text.strip().replace(' ', '')]
        tokenizer = char_tokenizer
    else:
        tokenizer = None
    assert tokenizer

    # 2. load utterances; keep only hyp utts with a matched, non-empty reference
    logging.info('Loading REF & HYP ...')
    ref_utts = LoadKaldiArc(args.ref)
    hyp_utts = LoadKaldiArc(args.hyp)
    uids = []
    for uid in sorted(hyp_utts.keys()):
        if uid in ref_utts.keys():
            if ref_utts[uid].text.strip():  # non-empty reference
                uids.append(uid)
            else:
                logging.warning(F'Found {uid} with empty reference, skipping...')
        else:
            logging.warning(F'Found {uid} without reference, skipping...')
            stats.num_hyp_without_ref += 1
    stats.num_hyp_utts = len(hyp_utts)
    stats.num_ref_utts = len(ref_utts)
    stats.num_eval_utts = len(uids)
    logging.info(f' #hyp:{stats.num_hyp_utts}, #ref:{stats.num_ref_utts}, #utts_to_evaluate:{stats.num_eval_utts}')
    print(f' #hyp:{stats.num_hyp_utts}, #ref:{stats.num_ref_utts}, #utts_to_evaluate:{stats.num_eval_utts}')

    # 3. collect vocabulary from the utterances (hyphenated tokens add extra forms)
    tokens = []
    for uid in uids:
        ref_tokens = tokenizer(ref_utts[uid].text)
        hyp_tokens = tokenizer(hyp_utts[uid].text)
        for t in ref_tokens + hyp_tokens:
            tokens.append(t)
            if '-' in t:
                tokens.extend(BreakHyphen(t))
    vocab_from_utts = list(set(tokens))
    logging.info(f' HYP&REF vocab size: {len(vocab_from_utts)}')
    print(f' HYP&REF vocab size: {len(vocab_from_utts)}')

    # 4. collect vocabulary from the GLM rules
    assert args.glm
    glm = LoadGLM(args.glm)
    tokens = []
    for rule in glm.values():
        for phrase in rule:
            for t in tokenizer(phrase):
                tokens.append(t)
                if '-' in t:
                    tokens.extend(BreakHyphen(t))
    vocab_from_glm = list(set(tokens))
    logging.info(f' GLM vocab size: {len(vocab_from_glm)}')
    print(f' GLM vocab size: {len(vocab_from_glm)}')

    vocab = list(set(vocab_from_utts + vocab_from_glm))
    logging.info(f'Global vocab size: {len(vocab)}')
    print(f'Global vocab size: {len(vocab)}')

    # 5. build the FST machinery
    symtab = BuildSymbolTable(
        # Normal evaluation vocab + auxiliary form for alternative paths + GLM tags
        vocab
        + [x + '#' for x in vocab]
        + [x for x in glm.keys()]
    )
    glm_tagger = BuildGLMTagger(glm, symtab)
    # FIX: removed leftover debug dumps of raw FST objects
    # (print(edit_transducer), print(ref_fst), print(hyp_fst),
    # print("alignment", alignment)) that polluted stdout.
    edit_transducer = EditTransducer(symbol_table=symtab, vocab=vocab)

    logging.info('Evaluating error rate ...')
    print('Evaluating error rate ...')
    # FIX: manage the result file with `with` so it is closed on any error path
    with open(args.result_file, 'w+', encoding='utf8') as fo:
        ndone = 0
        for uid in uids:
            ref = ref_utts[uid].text
            raw_hyp = hyp_utts[uid].text
            ref_fst = pynini.accep(' '.join(tokenizer(ref)), token_type=symtab)
            raw_hyp_fst = pynini.accep(' '.join(tokenizer(raw_hyp)), token_type=symtab)
            # Say, we have:
            #   RULE_001: "I'M" <-> "I AM"
            #   REF: HEY I AM HERE
            #   HYP: HEY I'M HERE
            # We want to expand HYP with GLM rules (marked with auxiliary #):
            #   HYP#: HEY {I'M | I# AM#} HERE
            # REF is honored to keep its original form.
            # This could be considered as a flexible on-the-fly TN towards HYP.
            #
            # 5.1 GLM rule tagging:
            #   HEY I'M HERE  ->  HEY <RULE_001> I'M <RULE_001> HERE
            lattice = (raw_hyp_fst @ glm_tagger).optimize()
            tagged_ir = pynini.shortestpath(lattice, nshortest=1, unique=True).string(token_type=symtab)
            # 5.2 GLM rule expansion:
            #   HEY <RULE_001> I'M <RULE_001> HERE
            #   ->  sausage-like fst: HEY {I'M | I# AM#} HERE
            tokens = tagged_ir.split()
            sausage = pynini.accep('', token_type=symtab)
            i = 0
            while i < len(tokens):  # invariant: tokens[0, i) has been built into fst
                forms = []
                if tokens[i].startswith('<RULE_') and tokens[i].endswith('>'):  # rule segment
                    rule_name = tokens[i]
                    rule = glm[rule_name]
                    # pre-condition: i -> ltag
                    raw_form = ''
                    for j in range(i + 1, len(tokens)):
                        if tokens[j] == rule_name:
                            raw_form = ' '.join(tokens[i + 1 : j])
                            break
                    assert raw_form
                    # post-condition: i -> ltag, j -> rtag
                    forms.append(raw_form)
                    for phrase in rule:
                        if phrase != raw_form:
                            forms.append(' '.join([x + '#' for x in phrase.split()]))
                    i = j + 1
                else:  # normal token segment
                    token = tokens[i]
                    forms.append(token)
                    if "-" in token:  # token with hyphen yields extra forms
                        forms.append(' '.join([x + '#' for x in token.split('-')]))  # 'T-SHIRT' -> 'T# SHIRT#'
                        forms.append(token.replace('-', '') + '#')  # 'T-SHIRT' -> 'TSHIRT#'
                    i += 1
                sausage_segment = pynini.union(*[pynini.accep(x, token_type=symtab) for x in forms]).optimize()
                sausage += sausage_segment
            hyp_fst = sausage.optimize()

            # 6. utterance-level error rate evaluation via the best alignment
            alignment = edit_transducer.compute_alignment(ref_fst, hyp_fst)
            distance = 0.0
            C, S, I, D = 0, 0, 0, 0  # Cor, Sub, Ins, Del
            edit_ali, ref_ali, hyp_ali = [], [], []
            for state in alignment.states():
                for arc in alignment.arcs(state):
                    i, o = arc.ilabel, arc.olabel
                    if i != 0 and o != 0 and SymbolEQ(symtab, i, o):
                        e = 'C'
                        r, h = symtab.find(i), symtab.find(o)
                        C += 1
                        distance += 0.0
                    elif i != 0 and o != 0 and not SymbolEQ(symtab, i, o):
                        e = 'S'
                        r, h = symtab.find(i), symtab.find(o)
                        S += 1
                        distance += 1.0
                    elif i == 0 and o != 0:
                        e = 'I'
                        r, h = '*', symtab.find(o)
                        I += 1
                        distance += 1.0
                    elif i != 0 and o == 0:
                        e = 'D'
                        r, h = symtab.find(i), '*'
                        D += 1
                        distance += 1.0
                    else:
                        raise RuntimeError
                    edit_ali.append(e)
                    ref_ali.append(r)
                    hyp_ali.append(h)
            # assert(distance == edit_transducer.compute_distance(ref_fst, sausage))
            utt_ter, utt_mter = ComputeTokenErrorRate(C, S, I, D)
            # print(F'{{"uid":{uid}, "score":{-distance}, "TER":{utt_ter:.2f}, "mTER":{utt_mter:.2f}, "cor":{C}, "sub":{S}, "ins":{I}, "del":{D}}}', file=fo)
            # PrintPrettyAlignment(raw_hyp, edit_ali, ref_ali, hyp_ali, fo)
            if utt_ter > 0:
                stats.num_utts_with_error += 1
            stats.C += C
            stats.S += S
            stats.I += I
            stats.D += D
            ndone += 1
            if ndone % args.logk == 0:
                logging.info(f'{ndone} utts evaluated.')
        logging.info(f'{ndone} utts evaluated in total.')

        # 7. corpus-level evaluation
        stats.token_error_rate, stats.modified_token_error_rate = ComputeTokenErrorRate(stats.C, stats.S, stats.I, stats.D)
        stats.sentence_error_rate = ComputeSentenceErrorRate(stats.num_utts_with_error, stats.num_eval_utts)
        print(stats.to_json(), file=fo)
        # print(stats.to_kaldi())
        # print(stats.to_summary(), file=fo)

View File

@@ -0,0 +1,370 @@
#!/usr/bin/env python3
# coding=utf8
# Copyright 2021 Jiayu DU
import sys
import argparse
import json
import logging
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='[%(levelname)s] %(message)s')
DEBUG = None
def GetEditType(ref_token, hyp_token):
    """Classifies one alignment arc: 'I'nsertion, 'D'eletion, 'C'orrect or 'S'ubstitution.

    A None token denotes the epsilon (absent) side of the arc.
    """
    # FIX: identity comparison with None (`is` / `is not`) instead of `==` / `!=`
    if ref_token is None and hyp_token is not None:
        return 'I'
    if ref_token is not None and hyp_token is None:
        return 'D'
    # two None tokens compare equal and yield 'C', matching the original behavior;
    # the original trailing `else: raise RuntimeError` was unreachable
    return 'C' if ref_token == hyp_token else 'S'
class AlignmentArc:
    """One arc of an alignment path: grid endpoints plus the aligned token pair."""

    def __init__(self, src, dst, ref, hyp):
        self.src, self.dst = src, dst
        self.ref, self.hyp = ref, hyp
        # classify the arc as C/S/I/D once, at construction time
        self.edit_type = GetEditType(ref, hyp)
def similarity_score_function(ref_token, hyp_token):
    """0 for a match, -1.0 for a mismatch (substitution)."""
    if ref_token == hyp_token:
        return 0
    return -1.0


def insertion_score_function(token):
    """Flat -1.0 penalty for inserting any token."""
    return -1.0


def deletion_score_function(token):
    """Flat -1.0 penalty for deleting any token."""
    return -1.0
def EditDistance(
        ref,
        hyp,
        similarity_score_function = similarity_score_function,
        insertion_score_function = insertion_score_function,
        deletion_score_function = deletion_score_function):
    """Aligns hyp against ref with dynamic programming (maximize total score).

    Args:
        ref: list of reference tokens (must be non-empty).
        hyp: list of hypothesis tokens (may be empty).
        *_score_function: scoring callbacks; higher is better, 0 means perfect.

    Returns:
        (best_path, best_score): best_path is a list of AlignmentArc in
        left-to-right order; best_score is the accumulated path score.

    Raises:
        ValueError: if ref is empty.
    """
    if len(ref) == 0:
        # FIX: explicit error instead of the original bare `assert`
        # (asserts are stripped under `python -O`)
        raise ValueError('EditDistance: ref must be non-empty')

    class DPState:
        def __init__(self):
            self.score = -float('inf')
            # backpointer into the grid
            self.prev_r = None
            self.prev_h = None

    def print_search_grid(S, R, H, fstream):
        # debug helper: dump the whole DP grid with scores and backpointers
        print(file=fstream)
        for r in range(R):
            for h in range(H):
                print(F'[{r},{h}]:{S[r][h].score:4.3f}:({S[r][h].prev_r},{S[r][h].prev_h}) ', end='', file=fstream)
            print(file=fstream)

    R = len(ref) + 1
    H = len(hyp) + 1
    # Construct DP search space, a (R x H) grid
    S = [[DPState() for _ in range(H)] for _ in range(R)]
    # initialize DP search grid origin, S(r = 0, h = 0)
    S[0][0].score = 0.0
    # initialize REF axis: reaching (r, 0) deletes ref[0:r]
    for r in range(1, R):
        S[r][0].score = S[r-1][0].score + deletion_score_function(ref[r-1])
        S[r][0].prev_r = r - 1
        S[r][0].prev_h = 0
    # initialize HYP axis: reaching (0, h) inserts hyp[0:h]
    for h in range(1, H):
        S[0][h].score = S[0][h-1].score + insertion_score_function(hyp[h-1])
        S[0][h].prev_r = 0
        S[0][h].prev_h = h - 1
    # Fill the grid; with `>=`, ties are resolved in favor of the
    # later-checked move (sub/cor, then del, then ins) as in the original.
    for r in range(1, R):
        for h in range(1, H):
            new_score = S[r-1][h-1].score + similarity_score_function(ref[r-1], hyp[h-1])
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r - 1
                S[r][h].prev_h = h - 1
            new_score = S[r-1][h].score + deletion_score_function(ref[r-1])
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r - 1
                S[r][h].prev_h = h
            new_score = S[r][h-1].score + insertion_score_function(hyp[h-1])
            if new_score >= S[r][h].score:
                S[r][h].score = new_score
                S[r][h].prev_r = r
                S[r][h].prev_h = h - 1
    # FIX: removed the dead pre-seeded best_score/best_state assignments that
    # the original overwrote unconditionally here.
    best_score = S[R-1][H-1].score
    best_state = (R-1, H-1)
    if DEBUG:
        print_search_grid(S, R, H, sys.stderr)
    # Backtrace the best alignment path, i.e. a list of arcs
    # arc = (src, dst, ref, hyp, edit_type)
    # src/dst = (r, h), where r/h refers to grid state-id along Ref/Hyp axis
    best_path = []
    r, h = best_state
    prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
    # loop invariant:
    #   (prev_r, prev_h) -> (r, h) is a "forward arc" on the best path;
    #   the origin (0, 0) has both backpointers None, terminating the loop
    while prev_r is not None or prev_h is not None:
        src = (prev_r, prev_h)
        dst = (r, h)
        if r == prev_r + 1 and h == prev_h + 1:  # substitution or correct
            arc = AlignmentArc(src, dst, ref[prev_r], hyp[prev_h])
        elif r == prev_r + 1 and h == prev_h:  # deletion
            arc = AlignmentArc(src, dst, ref[prev_r], None)
        elif r == prev_r and h == prev_h + 1:  # insertion
            arc = AlignmentArc(src, dst, None, hyp[prev_h])
        else:
            raise RuntimeError
        best_path.append(arc)
        r, h = prev_r, prev_h
        prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
    best_path.reverse()
    return (best_path, best_score)
def PrettyPrintAlignment(alignment, stream = sys.stderr):
    """Prints the alignment as three column-aligned rows: REF, HYP, EDIT.

    Correct arcs get a blank edit tag; None tokens render as '*'. Chinese
    characters are counted as two display columns.
    """
    def token_str(tok):
        return "*" if tok is None else tok

    def display_width(text):
        # CJK unified ideographs ('\u4e00'..'\u9fa5') occupy two columns
        # TODO: support other double-width-char language such as Japanese, Korean
        return sum(2 if '\u4e00' <= ch <= '\u9fa5' else 1 for ch in text)

    ref_row = ' REF : '
    hyp_row = ' HYP : '
    edit_row = ' EDIT : '
    for arc in alignment:
        ref_tok = token_str(arc.ref)
        hyp_tok = token_str(arc.hyp)
        edit_tag = '' if arc.edit_type == 'C' else arc.edit_type
        wr, wh, we = display_width(ref_tok), display_width(hyp_tok), display_width(edit_tag)
        col = max(wr, wh, we) + 1
        ref_row += ref_tok + ' ' * (col - wr)
        hyp_row += hyp_tok + ' ' * (col - wh)
        edit_row += edit_tag + ' ' * (col - we)
    print(ref_row, file=stream)
    print(hyp_row, file=stream)
    print(edit_row, file=stream)
def CountEdits(alignment):
    """Tallies (correct, substitution, insertion, deletion) arcs of an alignment."""
    tally = {'C': 0, 'S': 0, 'I': 0, 'D': 0}
    for arc in alignment:
        if arc.edit_type not in tally:
            raise RuntimeError
        tally[arc.edit_type] += 1
    return (tally['C'], tally['S'], tally['I'], tally['D'])
def ComputeTokenErrorRate(c, s, i, d):
return 100.0 * (s + d + i) / (s + d + c)
def ComputeSentenceErrorRate(num_err_utts, num_utts):
    """Percentage of utterances containing at least one error; num_utts must be non-zero."""
    assert num_utts != 0
    ratio = num_err_utts / num_utts
    return 100.0 * ratio
class EvaluationResult:
    """Corpus-level WER/SER accumulator with several report formats."""

    def __init__(self):
        # utterance bookkeeping (attribute order matters: to_json dumps __dict__)
        self.num_ref_utts = 0
        self.num_hyp_utts = 0
        self.num_eval_utts = 0  # seen in both ref & hyp
        self.num_hyp_without_ref = 0
        # edit counts: Correct / Substitution / Insertion / Deletion
        self.C = 0
        self.S = 0
        self.I = 0
        self.D = 0
        # derived percentages
        self.token_error_rate = 0.0
        self.num_utts_with_error = 0
        self.sentence_error_rate = 0.0

    def to_json(self):
        """One-line JSON dump of all fields."""
        return json.dumps(self.__dict__)

    def to_kaldi(self):
        """Kaldi-style %WER / %SER report."""
        n_edit = self.S + self.D + self.I
        n_ref = self.C + self.S + self.D
        return (
            f'%WER {self.token_error_rate:.2f} [ {n_edit} / {n_ref}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
            f'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
        )

    def to_sclite(self):
        """Not implemented yet."""
        return "TODO"

    def to_espnet(self):
        """Not implemented yet."""
        return "TODO"

    def to_summary(self):
        """Human-readable overall statistics banner."""
        rows = [
            '==================== Overall Statistics ====================',
            f'num_ref_utts: {self.num_ref_utts}',
            f'num_hyp_utts: {self.num_hyp_utts}',
            f'num_hyp_without_ref: {self.num_hyp_without_ref}',
            f'num_eval_utts: {self.num_eval_utts}',
            f'sentence_error_rate: {self.sentence_error_rate:.2f}%',
            f'token_error_rate: {self.token_error_rate:.2f}%',
            'token_stats:',
            f' - tokens:{self.C + self.S + self.D:>7}',
            f' - edits: {self.S + self.I + self.D:>7}',
            f' - cor: {self.C:>7}',
            f' - sub: {self.S:>7}',
            f' - ins: {self.I:>7}',
            f' - del: {self.D:>7}',
            '============================================================',
        ]
        return '\n'.join(rows) + '\n'
class Utterance:
    """Holds one utterance id together with its transcript text."""

    def __init__(self, uid, text):
        self.uid = uid
        self.text = text
def LoadUtterances(filepath, format):
    """Loads utterances from a text file into a dict of uid -> Utterance.

    Args:
        filepath: path to the utf8 input file.
        format: only 'text' is supported ('utt_id word1 word2 ...' per line).

    Returns:
        dict mapping utterance id to Utterance (text may be '').

    Raises:
        RuntimeError: on an unsupported format or a duplicated utterance id.
    """
    utts = {}
    if format == 'text':  # utt_id word1 word2 ...
        with open(filepath, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # skip blank lines
                # split(maxsplit=1) always yields 1 or 2 columns for a
                # non-empty line, so the original assert was redundant
                cols = line.split(maxsplit=1)
                uid = cols[0]
                text = cols[1] if len(cols) == 2 else ''
                # membership test instead of `utts.get(uid) != None`
                if uid in utts:
                    raise RuntimeError(F'Found duplicated utterence id {uid}')
                utts[uid] = Utterance(uid, text)
    else:
        raise RuntimeError(F'Unsupported text format {format}')
    return utts
def tokenize_text(text, tokenizer):
    """Splits text into tokens: by whitespace for WER, per character for CER."""
    if tokenizer == 'whitespace':
        return text.split()
    if tokenizer == 'char':
        # drop all whitespace, then break into single characters
        return list(''.join(text.split()))
    raise RuntimeError(F'ERROR: Unsupported tokenizer {tokenizer}')
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # optional
    parser.add_argument('--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER')
    parser.add_argument('--ref-format', choices=['text'], default='text', help='reference format, first col is utt_id, the rest is text')
    parser.add_argument('--hyp-format', choices=['text'], default='text', help='hypothesis format, first col is utt_id, the rest is text')
    # required
    parser.add_argument('--ref', type=str, required=True, help='input reference file')
    parser.add_argument('--hyp', type=str, required=True, help='input hypothesis file')
    parser.add_argument('result_file', type=str)
    args = parser.parse_args()
    logging.info(args)

    ref_utts = LoadUtterances(args.ref, args.ref_format)
    hyp_utts = LoadUtterances(args.hyp, args.hyp_format)

    r = EvaluationResult()
    # check valid utterances in hyp that have matched non-empty reference
    eval_utts = []
    r.num_hyp_without_ref = 0
    for uid in sorted(hyp_utts.keys()):
        if uid in ref_utts.keys():  # TODO: efficiency
            if ref_utts[uid].text.strip():  # non-empty reference
                eval_utts.append(uid)
            else:
                # FIX: logging.warn is a deprecated alias; use logging.warning
                logging.warning(F'Found {uid} with empty reference, skipping...')
        else:
            logging.warning(F'Found {uid} without reference, skipping...')
            r.num_hyp_without_ref += 1
    r.num_hyp_utts = len(hyp_utts)
    r.num_ref_utts = len(ref_utts)
    r.num_eval_utts = len(eval_utts)

    with open(args.result_file, 'w+', encoding='utf8') as fo:
        for uid in eval_utts:
            ref = ref_utts[uid]
            hyp = hyp_utts[uid]
            alignment, score = EditDistance(
                tokenize_text(ref.text, args.tokenizer),
                tokenize_text(hyp.text, args.tokenizer)
            )
            c, s, i, d = CountEdits(alignment)
            utt_ter = ComputeTokenErrorRate(c, s, i, d)
            # utt-level evaluation result
            print(F'{{"uid":{uid}, "score":{score}, "ter":{utt_ter:.2f}, "cor":{c}, "sub":{s}, "ins":{i}, "del":{d}}}', file=fo)
            PrettyPrintAlignment(alignment, fo)
            r.C += c
            r.S += s
            r.I += i
            r.D += d
            if utt_ter > 0:
                r.num_utts_with_error += 1
        # corpus level evaluation result
        r.sentence_error_rate = ComputeSentenceErrorRate(r.num_utts_with_error, r.num_eval_utts)
        r.token_error_rate = ComputeTokenErrorRate(r.C, r.S, r.I, r.D)
        print(r.to_summary(), file=fo)
    print(r.to_json())
    print(r.to_kaldi())

744
utils/speechio/glm_en.csv Normal file
View File

@@ -0,0 +1,744 @@
I'M,I AM
I'LL,I WILL
I'D,I HAD
I'VE,I HAVE
I WOULD'VE,I'D HAVE
YOU'RE,YOU ARE
YOU'LL,YOU WILL
YOU'D,YOU WOULD
YOU'VE,YOU HAVE
HE'S,HE IS,HE WAS
HE'LL,HE WILL
HE'D,HE HAD
SHE'S,SHE IS,SHE WAS
SHE'LL,SHE WILL
SHE'D,SHE HAD
IT'S,IT IS,IT WAS
IT'LL,IT WILL
WE'RE,WE ARE,WE WERE
WE'LL,WE WILL
WE'D,WE WOULD
WE'VE,WE HAVE
WHO'LL,WHO WILL
THEY'RE,THEY ARE
THEY'LL,THEY WILL
THAT'S,THAT IS,THAT WAS
THAT'LL,THAT WILL
HERE'S,HERE IS,HERE WAS
THERE'S,THERE IS,THERE WAS
WHERE'S,WHERE IS,WHERE WAS
WHAT'S,WHAT IS,WHAT WAS
LET'S,LET US
WHO'S,WHO IS
ONE'S,ONE IS
THERE'LL,THERE WILL
SOMEBODY'S,SOMEBODY IS
EVERYBODY'S,EVERYBODY IS
WOULD'VE,WOULD HAVE
CAN'T,CANNOT,CAN NOT
HADN'T,HAD NOT
HASN'T,HAS NOT
HAVEN'T,HAVE NOT
ISN'T,IS NOT
AREN'T,ARE NOT
WON'T,WILL NOT
WOULDN'T,WOULD NOT
SHOULDN'T,SHOULD NOT
DON'T,DO NOT
DIDN'T,DID NOT
GOTTA,GOT TO
GONNA,GOING TO
WANNA,WANT TO
LEMME,LET ME
GIMME,GIVE ME
DUNNO,DON'T KNOW
GOTCHA,GOT YOU
KINDA,KIND OF
MYSELF,MY SELF
YOURSELF,YOUR SELF
HIMSELF,HIM SELF
HERSELF,HER SELF
ITSELF,IT SELF
OURSELVES,OUR SELVES
OKAY,OK,O K
Y'ALL,YALL,YOU ALL
'CAUSE,'COS,CUZ,BECAUSE
FUCKIN',FUCKING
KILLING,KILLIN'
EVERYDAY,EVERY DAY
DOCTOR,DR,DR.
MRS,MISSES,MISSUS
MR,MR.,MISTER
SR,SR.,SENIOR
JR,JR.,JUNIOR
ST,ST.,SAINT
VOL,VOL.,VOLUME
CM,CENTIMETER,CENTIMETRE
MM,MILLIMETER,MILLIMETRE
KM,KILOMETER,KILOMETRE
KB,KILOBYTES,KILO BYTES,K B
MB,MEGABYTES,MEGA BYTES
GB,GIGABYTES,GIGA BYTES,G B
THOUSAND,THOUSAND AND
HUNDRED,HUNDRED AND
A HUNDRED,ONE HUNDRED
TWO THOUSAND AND,TWENTY,TWO THOUSAND
STORYTELLER,STORY TELLER
TSHIRT,T SHIRT
TSHIRTS,T SHIRTS
LEUKAEMIA,LEUKEMIA
OESTROGEN,ESTROGEN
ACKNOWLEDGMENT,ACKNOWLEDGEMENT
JUDGMENT,JUDGEMENT
MAMMA,MAMA
DINING,DINNING
FLACK,FLAK
LEARNT,LEARNED
BLONDE,BLOND
JUMPSTART,JUMP START
RIGHTNOW,RIGHT NOW
EVERYONE,EVERY ONE
NAME'S,NAME IS
FAMILY'S,FAMILY IS
COMPANY'S,COMPANY HAS
GRANDKID,GRAND KID
GRANDKIDS,GRAND KIDS
MEALTIMES,MEAL TIMES
ALRIGHT,ALL RIGHT
GROWNUP,GROWN UP
GROWNUPS,GROWN UPS
SCHOOLDAYS,SCHOOL DAYS
SCHOOLCHILDREN,SCHOOL CHILDREN
CASEBOOK,CASE BOOK
HUNGOVER,HUNG OVER
HANDCLAPS,HAND CLAPS
HANDCLAP,HAND CLAP
HEATWAVE,HEAT WAVE
ADDON,ADD ON
ONTO,ON TO
INTO,IN TO
GOTO,GO TO
GUNSHOT,GUN SHOT
MOTHERFUCKER,MOTHER FUCKER
OFTENTIMES,OFTEN TIMES
SARTRE'S,SARTRE IS
NONSTARTER,NON STARTER
NONSTARTERS,NON STARTERS
LONGTIME,LONG TIME
POLICYMAKERS,POLICY MAKERS
ANYMORE,ANY MORE
CANADA'S,CANADA IS
CELLPHONE,CELL PHONE
WORKPLACE,WORK PLACE
UNDERESTIMATING,UNDER ESTIMATING
CYBERSECURITY,CYBER SECURITY
NORTHEAST,NORTH EAST
ANYTIME,ANY TIME
LIVESTREAM,LIVE STREAM
LIVESTREAMS,LIVE STREAMS
WEBCAM,WEB CAM
EMAIL,E MAIL
ECAM,E CAM
VMIX,V MIX
SETUP,SET UP
SMARTPHONE,SMART PHONE
MULTICASTING,MULTI CASTING
CHITCHAT,CHIT CHAT
SEMIFINAL,SEMI FINAL
SEMIFINALS,SEMI FINALS
BBQ,BARBECUE
STORYLINE,STORY LINE
STORYLINES,STORY LINES
BRO,BROTHER
BROS,BROTHERS
OVERPROTECTIIVE,OVER PROTECTIVE
TIMEOUT,TIME OUT
ADVISOR,ADVISER
TIMBERWOLVES,TIMBER WOLVES
WEBPAGE,WEB PAGE
NEWCOMER,NEW COMER
DELMAR,DEL MAR
NETPLAY,NET PLAY
STREETSIDE,STREET SIDE
COLOURED,COLORED
COLOURFUL,COLORFUL
O,ZERO
ETCETERA,ET CETERA
FUNDRAISING,FUND RAISING
RAINFOREST,RAIN FOREST
BREATHTAKING,BREATH TAKING
WIKIPAGE,WIKI PAGE
OVERTIME,OVER TIME
TRAIN'S,TRAIN IS
ANYONE,ANY ONE
PHYSIOTHERAPY,PHYSIO THERAPY
ANYBODY,ANY BODY
BOTTLECAPS,BOTTLE CAPS
BOTTLECAP,BOTTLE CAP
STEPFATHER'S,STEP FATHER'S
STEPFATHER,STEP FATHER
WARTIME,WAR TIME
SCREENSHOT,SCREEN SHOT
TIMELINE,TIME LINE
CITY'S,CITY IS
NONPROFIT,NON PROFIT
KPOP,K POP
HOMEBASE,HOME BASE
LIFELONG,LIFE LONG
LAWSUITS,LAW SUITS
MULTIBILLION,MULTI BILLION
ROADMAP,ROAD MAP
GUY'S,GUY IS
CHECKOUT,CHECK OUT
SQUARESPACE,SQUARE SPACE
REDLINING,RED LINING
BASE'S,BASE IS
TAKEAWAY,TAKE AWAY
CANDYLAND,CANDY LAND
ANTISOCIAL,ANTI SOCIAL
CASEWORK,CASE WORK
RIGOR,RIGOUR
ORGANIZATIONS,ORGANISATIONS
ORGANIZATION,ORGANISATION
SIGNPOST,SIGN POST
WWII,WORLD WAR TWO
WINDOWPANE,WINDOW PANE
SUREFIRE,SURE FIRE
MOUNTAINTOP,MOUNTAIN TOP
SALESPERSON,SALES PERSON
NETWORK,NET WORK
MINISERIES,MINI SERIES
EDWARDS'S,EDWARDS IS
INTERSUBJECTIVITY,INTER SUBJECTIVITY
LIBERALISM'S,LIBERALISM IS
TAGLINE,TAG LINE
SHINETHEORY,SHINE THEORY
CALLYOURGIRLFRIEND,CALL YOUR GIRLFRIEND
STARTUP,START UP
BREAKUP,BREAK UP
RADIOTOPIA,RADIO TOPIA
HEARTBREAKING,HEART BREAKING
AUTOIMMUNE,AUTO IMMUNE
SINISE'S,SINISE IS
KICKBACK,KICK BACK
FOGHORN,FOG HORN
BADASS,BAD ASS
POWERAMERICAFORWARD,POWER AMERICA FORWARD
GOOGLE'S,GOOGLE IS
ROLEPLAY,ROLE PLAY
PRICE'S,PRICE IS
STANDOFF,STAND OFF
FOREVER,FOR EVER
GENERAL'S,GENERAL IS
DOG'S,DOG IS
AUDIOBOOK,AUDIO BOOK
ANYWAY,ANY WAY
PIGEONHOLE,PIGEON HOLE
EGGSHELLS,EGG SHELLS
VACCINE'S,VACCINE IS
WORKOUT,WORK OUT
ADMINISTRATOR'S,ADMINISTRATOR IS
FUCKUP,FUCK UP
RUNOFFS,RUN OFFS
COLORWAY,COLOR WAY
WAITLIST,WAIT LIST
HEALTHCARE,HEALTH CARE
TEXTBOOK,TEXT BOOK
CALLBACK,CALL BACK
PARTYGOERS,PARTY GOERS
SOMEDAY,SOME DAY
NIGHTGOWN,NIGHT GOWN
STANDALONG,STAND ALONG
BUSSINESSWOMAN,BUSSINESS WOMAN
STORYTELLING,STORY TELLING
MARKETPLACE,MARKET PLACE
CRATEJOY,CRATE JOY
OUTPERFORMED,OUT PERFORMED
TRUEBOTANICALS,TRUE BOTANICALS
NONFICTION,NON FICTION
SPINOFF,SPIN OFF
MOTHERFUCKING,MOTHER FUCKING
TRACKLIST,TRACK LIST
GODDAMN,GOD DAMN
PORNHUB,PORN HUB
UNDERAGE,UNDER AGE
GOODBYE,GOOD BYE
HARDCORE,HARD CORE
TRUCK'S,TRUCK IS
COUNTERSTEERING,COUNTER STEERING
BUZZWORD,BUZZ WORD
SUBCOMPONENTS,SUB COMPONENTS
MOREOVER,MORE OVER
PICKUP,PICK UP
NEWSLETTER,NEWS LETTER
KEYWORD,KEY WORD
LOGIN,LOG IN
TOOLBOX,TOOL BOX
LINK'S,LINK IS
PRIMIALVIDEO,PRIMAL VIDEO
DOTNET,DOT NET
AIRSTRIKE,AIR STRIKE
HAIRSTYLE,HAIR STYLE
TOWNSFOLK,TOWNS FOLK
GOLDFISH,GOLD FISH
TOM'S,TOM IS
HOMETOWN,HOME TOWN
CORONAVIRUS,CORONA VIRUS
PLAYSTATION,PLAY STATION
TOMORROW,TO MORROW
TIMECONSUMING,TIME CONSUMING
POSTWAR,POST WAR
HANDSON,HANDS ON
SHAKEUP,SHAKE UP
ECOMERS,E COMERS
COFOUNDER,CO FOUNDER
HIGHEND,HIGH END
INPERSON,IN PERSON
GROWNUP,GROWN UP
SELFREGULATION,SELF REGULATION
INDEPTH,IN DEPTH
ALLTIME,ALL TIME
LONGTERM,LONG TERM
SOCALLED,SO CALLED
SELFCONFIDENCE,SELF CONFIDENCE
STANDUP,STAND UP
MINDBOGGLING,MIND BOGGLING
BEINGFOROTHERS,BEING FOR OTHERS
COWROTE,CO WROTE
COSTARRED,CO STARRED
EDITORINCHIEF,EDITOR IN CHIEF
HIGHSPEED,HIGH SPEED
DECISIONMAKING,DECISION MAKING
WELLBEING,WELL BEING
NONTRIVIAL,NON TRIVIAL
PREEXISTING,PRE EXISTING
STATEOWNED,STATE OWNED
PLUGIN,PLUG IN
PROVERSION,PRO VERSION
OPTIN,OPT IN
FOLLOWUP,FOLLOW UP
FOLLOWUPS,FOLLOW UPS
WIFI,WI FI
THIRDPARTY,THIRD PARTY
PROFESSIONALLOOKING,PROFESSIONAL LOOKING
FULLSCREEN,FULL SCREEN
BUILTIN,BUILT IN
MULTISTREAM,MULTI STREAM
LOWCOST,LOW COST
RESTREAM,RE STREAM
GAMECHANGER,GAME CHANGER
WELLDEVELOPED,WELL DEVELOPED
QUARTERINCH,QUARTER INCH
FASTFASHION,FAST FASHION
ECOMMERCE,E COMMERCE
PRIZEWINNING,PRIZE WINNING
NEVERENDING,NEVER ENDING
MINDBLOWING,MIND BLOWING
REALLIFE,REAL LIFE
REOPEN,RE OPEN
ONDEMAND,ON DEMAND
PROBLEMSOLVING,PROBLEM SOLVING
HEAVYHANDED,HEAVY HANDED
OPENENDED,OPEN ENDED
SELFCONTROL,SELF CONTROL
WELLMEANING,WELL MEANING
COHOST,CO HOST
RIGHTSBASED,RIGHTS BASED
HALFBROTHER,HALF BROTHER
FATHERINLAW,FATHER IN LAW
COAUTHOR,CO AUTHOR
REELECTION,RE ELECTION
SELFHELP,SELF HELP
PROLIFE,PRO LIFE
ANTIDUKE,ANTI DUKE
POSTSTRUCTURALIST,POST STRUCTURALIST
COFOUNDED,CO FOUNDED
XRAY,X RAY
ALLAROUND,ALL AROUND
HIGHTECH,HIGH TECH
TMOBILE,T MOBILE
INHOUSE,IN HOUSE
POSTMORTEM,POST MORTEM
LITTLEKNOWN,LITTLE KNOWN
FALSEPOSITIVE,FALSE POSITIVE
ANTIVAXXER,ANTI VAXXER
EMAILS,E MAILS
DRIVETHROUGH,DRIVE THROUGH
DAYTODAY,DAY TO DAY
COSTAR,CO STAR
EBAY,E BAY
KOOLAID,KOOL AID
ANTIDEMOCRATIC,ANTI DEMOCRATIC
MIDDLEAGED,MIDDLE AGED
SHORTLIVED,SHORT LIVED
BESTSELLING,BEST SELLING
TICTACS,TIC TACS
UHHUH,UH HUH
MULTITANK,MULTI TANK
JAWDROPPING,JAW DROPPING
LIVESTREAMING,LIVE STREAMING
HARDWORKING,HARD WORKING
BOTTOMDWELLING,BOTTOM DWELLING
PRESHOW,PRE SHOW
HANDSFREE,HANDS FREE
TRICKORTREATING,TRICK OR TREATING
PRERECORDED,PRE RECORDED
DOGOODERS,DO GOODERS
WIDERANGING,WIDE RANGING
LIFESAVING,LIFE SAVING
SKIREPORT,SKI REPORT
SNOWBASE,SNOW BASE
JAYZ,JAY Z
SPIDERMAN,SPIDER MAN
FREEKICK,FREE KICK
EDWARDSHELAIRE,EDWARDS HELAIRE
SHORTTERM,SHORT TERM
HAVENOTS,HAVE NOTS
SELFINTEREST,SELF INTEREST
SELFINTERESTED,SELF INTERESTED
SELFCOMPASSION,SELF COMPASSION
MACHINELEARNING,MACHINE LEARNING
COAUTHORED,CO AUTHORED
NONGOVERNMENT,NON GOVERNMENT
SUBSAHARAN,SUB SAHARAN
COCHAIR,CO CHAIR
LARGESCALE,LARGE SCALE
VIDEOONDEMAND,VIDEO ON DEMAND
FIRSTCLASS,FIRST CLASS
COFOUNDERS,CO FOUNDERS
COOP,CO OP
PREORDERS,PRE ORDERS
DOUBLEENTRY,DOUBLE ENTRY
SELFCONFIDENT,SELF CONFIDENT
SELFPORTRAIT,SELF PORTRAIT
NONWHITE,NON WHITE
ONBOARD,ON BOARD
HALFLIFE,HALF LIFE
ONCOURT,ON COURT
SCIFI,SCI FI
XMEN,X MEN
DAYLEWIS,DAY LEWIS
LALALAND,LA LA LAND
AWARDWINNING,AWARD WINNING
BOXOFFICE,BOX OFFICE
TRIDACTYLS,TRI DACTYLS
TRIDACTYL,TRI DACTYL
MEDIUMSIZED,MEDIUM SIZED
POSTSECONDARY,POST SECONDARY
FULLTIME,FULL TIME
GOKART,GO KART
OPENAIR,OPEN AIR
WELLKNOWN,WELL KNOWN
ICECREAM,ICE CREAM
EARTHMOON,EARTH MOON
STATEOFTHEART,STATE OF THE ART
BSIDE,B SIDE
EASTWEST,EAST WEST
ALLSTAR,ALL STAR
RUNNERUP,RUNNER UP
HORSEDRAWN,HORSE DRAWN
OPENSOURCE,OPEN SOURCE
PURPOSEBUILT,PURPOSE BUILT
SQUAREFREE,SQUARE FREE
PRESENTDAY,PRESENT DAY
CANADAUNITED,CANADA UNITED
HOTCHPOTCH,HOTCH POTCH
LOWLYING,LOW LYING
RIGHTHANDED,RIGHT HANDED
PEARSHAPED,PEAR SHAPED
BESTKNOWN,BEST KNOWN
FULLLENGTH,FULL LENGTH
YEARROUND,YEAR ROUND
PREELECTION,PRE ELECTION
RERECORD,RE RECORD
MINIALBUM,MINI ALBUM
LONGESTRUNNING,LONGEST RUNNING
ALLIRELAND,ALL IRELAND
NORTHWESTERN,NORTH WESTERN
PARTTIME,PART TIME
NONGOVERNMENTAL,NON GOVERNMENTAL
ONLINE,ON LINE
ONAIR,ON AIR
NORTHSOUTH,NORTH SOUTH
RERELEASED,RE RELEASED
LEFTHANDED,LEFT HANDED
BSIDES,B SIDES
ANGLOSAXON,ANGLO SAXON
SOUTHSOUTHEAST,SOUTH SOUTHEAST
CROSSCOUNTRY,CROSS COUNTRY
REBUILT,RE BUILT
FREEFORM,FREE FORM
SCOOBYDOO,SCOOBY DOO
ATLARGE,AT LARGE
COUNCILMANAGER,COUNCIL MANAGER
LONGRUNNING,LONG RUNNING
PREWAR,PRE WAR
REELECTED,RE ELECTED
HIGHSCHOOL,HIGH SCHOOL
RUNNERSUP,RUNNERS UP
NORTHWEST,NORTH WEST
WEBBASED,WEB BASED
HIGHQUALITY,HIGH QUALITY
RIGHTWING,RIGHT WING
LANEFOX,LANE FOX
PAYPERVIEW,PAY PER VIEW
COPRODUCTION,CO PRODUCTION
NONPARTISAN,NON PARTISAN
FIRSTPERSON,FIRST PERSON
WORLDRENOWNED,WORLD RENOWNED
VICEPRESIDENT,VICE PRESIDENT
PROROMAN,PRO ROMAN
COPRODUCED,CO PRODUCED
LOWPOWER,LOW POWER
SELFESTEEM,SELF ESTEEM
SEMITRANSPARENT,SEMI TRANSPARENT
SECONDINCOMMAND,SECOND IN COMMAND
HIGHRISE,HIGH RISE
COHOSTED,CO HOSTED
AFRICANAMERICAN,AFRICAN AMERICAN
SOUTHWEST,SOUTH WEST
WELLPRESERVED,WELL PRESERVED
FEATURELENGTH,FEATURE LENGTH
HIPHOP,HIP HOP
ALLBIG,ALL BIG
SOUTHEAST,SOUTH EAST
COUNTERATTACK,COUNTER ATTACK
QUARTERFINALS,QUARTER FINALS
STABLEDOOR,STABLE DOOR
DARKEYED,DARK EYED
ALLAMERICAN,ALL AMERICAN
THIRDPERSON,THIRD PERSON
LOWLEVEL,LOW LEVEL
NTERMINAL,N TERMINAL
DRIEDUP,DRIED UP
AFRICANAMERICANS,AFRICAN AMERICANS
ANTIAPARTHEID,ANTI APARTHEID
STOKEONTRENT,STOKE ON TRENT
NORTHNORTHEAST,NORTH NORTHEAST
BRANDNEW,BRAND NEW
RIGHTANGLED,RIGHT ANGLED
GOVERNMENTOWNED,GOVERNMENT OWNED
SONINLAW,SON IN LAW
SUBJECTOBJECTVERB,SUBJECT OBJECT VERB
LEFTARM,LEFT ARM
LONGLIVED,LONG LIVED
REDEYE,RED EYE
TPOSE,T POSE
NIGHTVISION,NIGHT VISION
SOUTHEASTERN,SOUTH EASTERN
WELLRECEIVED,WELL RECEIVED
ALFAYOUM,AL FAYOUM
TIMEBASED,TIME BASED
KETTLEDRUMS,KETTLE DRUMS
BRIGHTEYED,BRIGHT EYED
REDBROWN,RED BROWN
SAMESEX,SAME SEX
PORTDEPAIX,PORT DE PAIX
CLEANUP,CLEAN UP
PERCENT,PERCENT SIGN
TAKEOUT,TAKE OUT
KNOWHOW,KNOW HOW
FISHBONE,FISH BONE
FISHSTICKS,FISH STICKS
PAPERWORK,PAPER WORK
NICKNACKS,NICK NACKS
STREETTALKING,STREET TALKING
NONACADEMIC,NON ACADEMIC
SHELLY,SHELLEY
SHELLY'S,SHELLEY'S
JIMMY,JIMMIE
JIMMY'S,JIMMIE'S
DRUGSTORE,DRUG STORE
THRU,THROUGH
PLAYDATE,PLAY DATE
MICROLIFE,MICRO LIFE
SKILLSET,SKILL SET
SKILLSETS,SKILL SETS
TRADEOFF,TRADE OFF
TRADEOFFS,TRADE OFFS
ONSCREEN,ON SCREEN
PLAYBACK,PLAY BACK
ARTWORK,ART WORK
COWORKER,CO WORKER
COWORKERS,CO WORKERS
SOMETIME,SOME TIME
SOMETIMES,SOME TIMES
CROWDFUNDING,CROWD FUNDING
AM,A.M.,A M
PM,P.M.,P M
TV,T V
MBA,M B A
USA,U S A
US,U S
UK,U K
CEO,C E O
CFO,C F O
COO,C O O
CIO,C I O
FM,F M
GMC,G M C
FSC,F S C
NPD,N P D
APM,A P M
NGO,N G O
TD,T D
LOL,L O L
IPO,I P O
CNBC,C N B C
IPOS,I P OS
CNBC'S,C N B C'S
JT,J T
NPR,N P R
NPR'S,N P R'S
MP,M P
IOI,I O I
DW,D W
CNN,C N N
WSM,W S M
ET,E T
IT,I T
RJ,R J
DVD,D V D
DVD'S,D V D'S
HBO,H B O
LA,L A
XC,X C
SUV,S U V
NBA,N B A
NBA'S,N B A'S
ESPN,E S P N
ESPN'S,E S P N'S
ADT,A D T
HD,H D
VIP,V I P
TMZ,T M Z
CBC,C B C
NPO,N P O
BBC,B B C
LA'S,L A'S
TMZ'S,T M Z'S
HIV,H I V
FTC,F T C
EU,E U
PHD,P H D
AI,A I
FHI,F H I
ICML,I C M L
ICLR,I C L R
BMW,B M W
EV,E V
CR,C R
API,A P I
ICO,I C O
LTE,L T E
OBS,O B S
PC,P C
IO,I O
CRM,C R M
RTMP,R T M P
ASMR,A S M R
GG,G G
WWW,W W W
PEI,P E I
JJ,J J
PT,P T
DJ,D J
SD,S D
POW,P.O.W.,P O W
FYI,F Y I
DC,D C,D.C
ABC,A B C
TJ,T J
WMDT,W M D T
WDTN,W D T N
TY,T Y
EJ,E J
CJ,C J
ACL,A C L
UK'S,U K'S
GTV,G T V
MDMA,M D M A
DFW,D F W
WTF,W T F
AJ,A J
MD,M D
PH,P H
ID,I D
SEO,S E O
UTM'S,U T M'S
EC,E C
UFC,U F C
RV,R V
UTM,U T M
CSV,C S V
SMS,S M S
GRB,G R B
GT,G T
LEM,L E M
XR,X R
EDU,E D U
NBC,N B C
EMS,E M S
CDC,C D C
MLK,M L K
IE,I E
OC,O C
HR,H R
MA,M A
DEE,D E E
AP,A P
UFO,U F O
DE,D E
LGBTQ,L G B T Q
PTA,P T A
NHS,N H S
CMA,C M A
MGM,M G M
AKA,A K A
HW,H W
GOP,G O P
GOP'S,G O P'S
FBI,F B I
PRX,P R X
CTO,C T O
URL,U R L
EIN,E I N
MLS,M L S
CSI,C S I
AOC,A O C
CND,C N D
CP,C P
PP,P P
CLI,C L I
PB,P B
FDA,F D A
MRNA,M R N A
PR,P R
VP,V P
DNC,D N C
MSNBC,M S N B C
GQ,G Q
UT,U T
XXI,X X I
HRV,H R V
WHO,W H O
CRO,C R O
DPA,D P A
PPE,P P E
EVA,E V A
BP,B P
GPS,G P S
AR,A R
PJ,P J
MLM,M L M
OLED,O L E D
BO,B O
VE,V E
UN,U N
SLS,S L S
DM,D M
DM'S,D M'S
ASAP,A S A P
ETA,E T A
DOB,D O B
BMW,B M W
1 I'M,I AM
2 I'LL,I WILL
3 I'D,I HAD
4 I'VE,I HAVE
5 I WOULD'VE,I'D HAVE
6 YOU'RE,YOU ARE
7 YOU'LL,YOU WILL
8 YOU'D,YOU WOULD
9 YOU'VE,YOU HAVE
10 HE'S,HE IS,HE WAS
11 HE'LL,HE WILL
12 HE'D,HE HAD
13 SHE'S,SHE IS,SHE WAS
14 SHE'LL,SHE WILL
15 SHE'D,SHE HAD
16 IT'S,IT IS,IT WAS
17 IT'LL,IT WILL
18 WE'RE,WE ARE,WE WERE
19 WE'LL,WE WILL
20 WE'D,WE WOULD
21 WE'VE,WE HAVE
22 WHO'LL,WHO WILL
23 THEY'RE,THEY ARE
24 THEY'LL,THEY WILL
25 THAT'S,THAT IS,THAT WAS
26 THAT'LL,THAT WILL
27 HERE'S,HERE IS,HERE WAS
28 THERE'S,THERE IS,THERE WAS
29 WHERE'S,WHERE IS,WHERE WAS
30 WHAT'S,WHAT IS,WHAT WAS
31 LET'S,LET US
32 WHO'S,WHO IS
33 ONE'S,ONE IS
34 THERE'LL,THERE WILL
35 SOMEBODY'S,SOMEBODY IS
36 EVERYBODY'S,EVERYBODY IS
37 WOULD'VE,WOULD HAVE
38 CAN'T,CANNOT,CAN NOT
39 HADN'T,HAD NOT
40 HASN'T,HAS NOT
41 HAVEN'T,HAVE NOT
42 ISN'T,IS NOT
43 AREN'T,ARE NOT
44 WON'T,WILL NOT
45 WOULDN'T,WOULD NOT
46 SHOULDN'T,SHOULD NOT
47 DON'T,DO NOT
48 DIDN'T,DID NOT
49 GOTTA,GOT TO
50 GONNA,GOING TO
51 WANNA,WANT TO
52 LEMME,LET ME
53 GIMME,GIVE ME
54 DUNNO,DON'T KNOW
55 GOTCHA,GOT YOU
56 KINDA,KIND OF
57 MYSELF,MY SELF
58 YOURSELF,YOUR SELF
59 HIMSELF,HIM SELF
60 HERSELF,HER SELF
61 ITSELF,IT SELF
62 OURSELVES,OUR SELVES
63 OKAY,OK,O K
64 Y'ALL,YALL,YOU ALL
65 'CAUSE,'COS,CUZ,BECAUSE
66 FUCKIN',FUCKING
67 KILLING,KILLIN'
68 EVERYDAY,EVERY DAY
69 DOCTOR,DR,DR.
70 MRS,MISSES,MISSUS
71 MR,MR.,MISTER
72 SR,SR.,SENIOR
73 JR,JR.,JUNIOR
74 ST,ST.,SAINT
75 VOL,VOL.,VOLUME
76 CM,CENTIMETER,CENTIMETRE
77 MM,MILLIMETER,MILLIMETRE
78 KM,KILOMETER,KILOMETRE
79 KB,KILOBYTES,KILO BYTES,K B
80 MB,MEGABYTES,MEGA BYTES
81 GB,GIGABYTES,GIGA BYTES,G B
82 THOUSAND,THOUSAND AND
83 HUNDRED,HUNDRED AND
84 A HUNDRED,ONE HUNDRED
85 TWO THOUSAND AND,TWENTY,TWO THOUSAND
86 STORYTELLER,STORY TELLER
87 TSHIRT,T SHIRT
88 TSHIRTS,T SHIRTS
89 LEUKAEMIA,LEUKEMIA
90 OESTROGEN,ESTROGEN
91 ACKNOWLEDGMENT,ACKNOWLEDGEMENT
92 JUDGMENT,JUDGEMENT
93 MAMMA,MAMA
94 DINING,DINNING
95 FLACK,FLAK
96 LEARNT,LEARNED
97 BLONDE,BLOND
98 JUMPSTART,JUMP START
99 RIGHTNOW,RIGHT NOW
100 EVERYONE,EVERY ONE
101 NAME'S,NAME IS
102 FAMILY'S,FAMILY IS
103 COMPANY'S,COMPANY HAS
104 GRANDKID,GRAND KID
105 GRANDKIDS,GRAND KIDS
106 MEALTIMES,MEAL TIMES
107 ALRIGHT,ALL RIGHT
108 GROWNUP,GROWN UP
109 GROWNUPS,GROWN UPS
110 SCHOOLDAYS,SCHOOL DAYS
111 SCHOOLCHILDREN,SCHOOL CHILDREN
112 CASEBOOK,CASE BOOK
113 HUNGOVER,HUNG OVER
114 HANDCLAPS,HAND CLAPS
115 HANDCLAP,HAND CLAP
116 HEATWAVE,HEAT WAVE
117 ADDON,ADD ON
118 ONTO,ON TO
119 INTO,IN TO
120 GOTO,GO TO
121 GUNSHOT,GUN SHOT
122 MOTHERFUCKER,MOTHER FUCKER
123 OFTENTIMES,OFTEN TIMES
124 SARTRE'S,SARTRE IS
125 NONSTARTER,NON STARTER
126 NONSTARTERS,NON STARTERS
127 LONGTIME,LONG TIME
128 POLICYMAKERS,POLICY MAKERS
129 ANYMORE,ANY MORE
130 CANADA'S,CANADA IS
131 CELLPHONE,CELL PHONE
132 WORKPLACE,WORK PLACE
133 UNDERESTIMATING,UNDER ESTIMATING
134 CYBERSECURITY,CYBER SECURITY
135 NORTHEAST,NORTH EAST
136 ANYTIME,ANY TIME
137 LIVESTREAM,LIVE STREAM
138 LIVESTREAMS,LIVE STREAMS
139 WEBCAM,WEB CAM
140 EMAIL,E MAIL
141 ECAM,E CAM
142 VMIX,V MIX
143 SETUP,SET UP
144 SMARTPHONE,SMART PHONE
145 MULTICASTING,MULTI CASTING
146 CHITCHAT,CHIT CHAT
147 SEMIFINAL,SEMI FINAL
148 SEMIFINALS,SEMI FINALS
149 BBQ,BARBECUE
150 STORYLINE,STORY LINE
151 STORYLINES,STORY LINES
152 BRO,BROTHER
153 BROS,BROTHERS
154 OVERPROTECTIIVE,OVER PROTECTIVE
155 TIMEOUT,TIME OUT
156 ADVISOR,ADVISER
157 TIMBERWOLVES,TIMBER WOLVES
158 WEBPAGE,WEB PAGE
159 NEWCOMER,NEW COMER
160 DELMAR,DEL MAR
161 NETPLAY,NET PLAY
162 STREETSIDE,STREET SIDE
163 COLOURED,COLORED
164 COLOURFUL,COLORFUL
165 O,ZERO
166 ETCETERA,ET CETERA
167 FUNDRAISING,FUND RAISING
168 RAINFOREST,RAIN FOREST
169 BREATHTAKING,BREATH TAKING
170 WIKIPAGE,WIKI PAGE
171 OVERTIME,OVER TIME
172 TRAIN'S,TRAIN IS
173 ANYONE,ANY ONE
174 PHYSIOTHERAPY,PHYSIO THERAPY
175 ANYBODY,ANY BODY
176 BOTTLECAPS,BOTTLE CAPS
177 BOTTLECAP,BOTTLE CAP
178 STEPFATHER'S,STEP FATHER'S
179 STEPFATHER,STEP FATHER
180 WARTIME,WAR TIME
181 SCREENSHOT,SCREEN SHOT
182 TIMELINE,TIME LINE
183 CITY'S,CITY IS
184 NONPROFIT,NON PROFIT
185 KPOP,K POP
186 HOMEBASE,HOME BASE
187 LIFELONG,LIFE LONG
188 LAWSUITS,LAW SUITS
189 MULTIBILLION,MULTI BILLION
190 ROADMAP,ROAD MAP
191 GUY'S,GUY IS
192 CHECKOUT,CHECK OUT
193 SQUARESPACE,SQUARE SPACE
194 REDLINING,RED LINING
195 BASE'S,BASE IS
196 TAKEAWAY,TAKE AWAY
197 CANDYLAND,CANDY LAND
198 ANTISOCIAL,ANTI SOCIAL
199 CASEWORK,CASE WORK
200 RIGOR,RIGOUR
201 ORGANIZATIONS,ORGANISATIONS
202 ORGANIZATION,ORGANISATION
203 SIGNPOST,SIGN POST
204 WWII,WORLD WAR TWO
205 WINDOWPANE,WINDOW PANE
206 SUREFIRE,SURE FIRE
207 MOUNTAINTOP,MOUNTAIN TOP
208 SALESPERSON,SALES PERSON
209 NETWORK,NET WORK
210 MINISERIES,MINI SERIES
211 EDWARDS'S,EDWARDS IS
212 INTERSUBJECTIVITY,INTER SUBJECTIVITY
213 LIBERALISM'S,LIBERALISM IS
214 TAGLINE,TAG LINE
215 SHINETHEORY,SHINE THEORY
216 CALLYOURGIRLFRIEND,CALL YOUR GIRLFRIEND
217 STARTUP,START UP
218 BREAKUP,BREAK UP
219 RADIOTOPIA,RADIO TOPIA
220 HEARTBREAKING,HEART BREAKING
221 AUTOIMMUNE,AUTO IMMUNE
222 SINISE'S,SINISE IS
223 KICKBACK,KICK BACK
224 FOGHORN,FOG HORN
225 BADASS,BAD ASS
226 POWERAMERICAFORWARD,POWER AMERICA FORWARD
227 GOOGLE'S,GOOGLE IS
228 ROLEPLAY,ROLE PLAY
229 PRICE'S,PRICE IS
230 STANDOFF,STAND OFF
231 FOREVER,FOR EVER
232 GENERAL'S,GENERAL IS
233 DOG'S,DOG IS
234 AUDIOBOOK,AUDIO BOOK
235 ANYWAY,ANY WAY
236 PIGEONHOLE,PIGEON HOLE
237 EGGSHELLS,EGG SHELLS
238 VACCINE'S,VACCINE IS
239 WORKOUT,WORK OUT
240 ADMINISTRATOR'S,ADMINISTRATOR IS
241 FUCKUP,FUCK UP
242 RUNOFFS,RUN OFFS
243 COLORWAY,COLOR WAY
244 WAITLIST,WAIT LIST
245 HEALTHCARE,HEALTH CARE
246 TEXTBOOK,TEXT BOOK
247 CALLBACK,CALL BACK
248 PARTYGOERS,PARTY GOERS
249 SOMEDAY,SOME DAY
250 NIGHTGOWN,NIGHT GOWN
251 STANDALONG,STAND ALONG
252 BUSSINESSWOMAN,BUSSINESS WOMAN
253 STORYTELLING,STORY TELLING
254 MARKETPLACE,MARKET PLACE
255 CRATEJOY,CRATE JOY
256 OUTPERFORMED,OUT PERFORMED
257 TRUEBOTANICALS,TRUE BOTANICALS
258 NONFICTION,NON FICTION
259 SPINOFF,SPIN OFF
260 MOTHERFUCKING,MOTHER FUCKING
261 TRACKLIST,TRACK LIST
262 GODDAMN,GOD DAMN
263 PORNHUB,PORN HUB
264 UNDERAGE,UNDER AGE
265 GOODBYE,GOOD BYE
266 HARDCORE,HARD CORE
267 TRUCK'S,TRUCK IS
268 COUNTERSTEERING,COUNTER STEERING
269 BUZZWORD,BUZZ WORD
270 SUBCOMPONENTS,SUB COMPONENTS
271 MOREOVER,MORE OVER
272 PICKUP,PICK UP
273 NEWSLETTER,NEWS LETTER
274 KEYWORD,KEY WORD
275 LOGIN,LOG IN
276 TOOLBOX,TOOL BOX
277 LINK'S,LINK IS
278 PRIMIALVIDEO,PRIMAL VIDEO
279 DOTNET,DOT NET
280 AIRSTRIKE,AIR STRIKE
281 HAIRSTYLE,HAIR STYLE
282 TOWNSFOLK,TOWNS FOLK
283 GOLDFISH,GOLD FISH
284 TOM'S,TOM IS
285 HOMETOWN,HOME TOWN
286 CORONAVIRUS,CORONA VIRUS
287 PLAYSTATION,PLAY STATION
288 TOMORROW,TO MORROW
289 TIMECONSUMING,TIME CONSUMING
290 POSTWAR,POST WAR
291 HANDSON,HANDS ON
292 SHAKEUP,SHAKE UP
293 ECOMERS,E COMERS
294 COFOUNDER,CO FOUNDER
295 HIGHEND,HIGH END
296 INPERSON,IN PERSON
297 GROWNUP,GROWN UP
298 SELFREGULATION,SELF REGULATION
299 INDEPTH,IN DEPTH
300 ALLTIME,ALL TIME
301 LONGTERM,LONG TERM
302 SOCALLED,SO CALLED
303 SELFCONFIDENCE,SELF CONFIDENCE
304 STANDUP,STAND UP
305 MINDBOGGLING,MIND BOGGLING
306 BEINGFOROTHERS,BEING FOR OTHERS
307 COWROTE,CO WROTE
308 COSTARRED,CO STARRED
309 EDITORINCHIEF,EDITOR IN CHIEF
310 HIGHSPEED,HIGH SPEED
311 DECISIONMAKING,DECISION MAKING
312 WELLBEING,WELL BEING
313 NONTRIVIAL,NON TRIVIAL
314 PREEXISTING,PRE EXISTING
315 STATEOWNED,STATE OWNED
316 PLUGIN,PLUG IN
317 PROVERSION,PRO VERSION
318 OPTIN,OPT IN
319 FOLLOWUP,FOLLOW UP
320 FOLLOWUPS,FOLLOW UPS
321 WIFI,WI FI
322 THIRDPARTY,THIRD PARTY
323 PROFESSIONALLOOKING,PROFESSIONAL LOOKING
324 FULLSCREEN,FULL SCREEN
325 BUILTIN,BUILT IN
326 MULTISTREAM,MULTI STREAM
327 LOWCOST,LOW COST
328 RESTREAM,RE STREAM
329 GAMECHANGER,GAME CHANGER
330 WELLDEVELOPED,WELL DEVELOPED
331 QUARTERINCH,QUARTER INCH
332 FASTFASHION,FAST FASHION
333 ECOMMERCE,E COMMERCE
334 PRIZEWINNING,PRIZE WINNING
335 NEVERENDING,NEVER ENDING
336 MINDBLOWING,MIND BLOWING
337 REALLIFE,REAL LIFE
338 REOPEN,RE OPEN
339 ONDEMAND,ON DEMAND
340 PROBLEMSOLVING,PROBLEM SOLVING
341 HEAVYHANDED,HEAVY HANDED
342 OPENENDED,OPEN ENDED
343 SELFCONTROL,SELF CONTROL
344 WELLMEANING,WELL MEANING
345 COHOST,CO HOST
346 RIGHTSBASED,RIGHTS BASED
347 HALFBROTHER,HALF BROTHER
348 FATHERINLAW,FATHER IN LAW
349 COAUTHOR,CO AUTHOR
350 REELECTION,RE ELECTION
351 SELFHELP,SELF HELP
352 PROLIFE,PRO LIFE
353 ANTIDUKE,ANTI DUKE
354 POSTSTRUCTURALIST,POST STRUCTURALIST
355 COFOUNDED,CO FOUNDED
356 XRAY,X RAY
357 ALLAROUND,ALL AROUND
358 HIGHTECH,HIGH TECH
359 TMOBILE,T MOBILE
360 INHOUSE,IN HOUSE
361 POSTMORTEM,POST MORTEM
362 LITTLEKNOWN,LITTLE KNOWN
363 FALSEPOSITIVE,FALSE POSITIVE
364 ANTIVAXXER,ANTI VAXXER
365 EMAILS,E MAILS
366 DRIVETHROUGH,DRIVE THROUGH
367 DAYTODAY,DAY TO DAY
368 COSTAR,CO STAR
369 EBAY,E BAY
370 KOOLAID,KOOL AID
371 ANTIDEMOCRATIC,ANTI DEMOCRATIC
372 MIDDLEAGED,MIDDLE AGED
373 SHORTLIVED,SHORT LIVED
374 BESTSELLING,BEST SELLING
375 TICTACS,TIC TACS
376 UHHUH,UH HUH
377 MULTITANK,MULTI TANK
378 JAWDROPPING,JAW DROPPING
379 LIVESTREAMING,LIVE STREAMING
380 HARDWORKING,HARD WORKING
381 BOTTOMDWELLING,BOTTOM DWELLING
382 PRESHOW,PRE SHOW
383 HANDSFREE,HANDS FREE
384 TRICKORTREATING,TRICK OR TREATING
385 PRERECORDED,PRE RECORDED
386 DOGOODERS,DO GOODERS
387 WIDERANGING,WIDE RANGING
388 LIFESAVING,LIFE SAVING
389 SKIREPORT,SKI REPORT
390 SNOWBASE,SNOW BASE
391 JAYZ,JAY Z
392 SPIDERMAN,SPIDER MAN
393 FREEKICK,FREE KICK
394 EDWARDSHELAIRE,EDWARDS HELAIRE
395 SHORTTERM,SHORT TERM
396 HAVENOTS,HAVE NOTS
397 SELFINTEREST,SELF INTEREST
398 SELFINTERESTED,SELF INTERESTED
399 SELFCOMPASSION,SELF COMPASSION
400 MACHINELEARNING,MACHINE LEARNING
401 COAUTHORED,CO AUTHORED
402 NONGOVERNMENT,NON GOVERNMENT
403 SUBSAHARAN,SUB SAHARAN
404 COCHAIR,CO CHAIR
405 LARGESCALE,LARGE SCALE
406 VIDEOONDEMAND,VIDEO ON DEMAND
407 FIRSTCLASS,FIRST CLASS
408 COFOUNDERS,CO FOUNDERS
409 COOP,CO OP
410 PREORDERS,PRE ORDERS
411 DOUBLEENTRY,DOUBLE ENTRY
412 SELFCONFIDENT,SELF CONFIDENT
413 SELFPORTRAIT,SELF PORTRAIT
414 NONWHITE,NON WHITE
415 ONBOARD,ON BOARD
416 HALFLIFE,HALF LIFE
417 ONCOURT,ON COURT
418 SCIFI,SCI FI
419 XMEN,X MEN
420 DAYLEWIS,DAY LEWIS
421 LALALAND,LA LA LAND
422 AWARDWINNING,AWARD WINNING
423 BOXOFFICE,BOX OFFICE
424 TRIDACTYLS,TRI DACTYLS
425 TRIDACTYL,TRI DACTYL
426 MEDIUMSIZED,MEDIUM SIZED
427 POSTSECONDARY,POST SECONDARY
428 FULLTIME,FULL TIME
429 GOKART,GO KART
430 OPENAIR,OPEN AIR
431 WELLKNOWN,WELL KNOWN
432 ICECREAM,ICE CREAM
433 EARTHMOON,EARTH MOON
434 STATEOFTHEART,STATE OF THE ART
435 BSIDE,B SIDE
436 EASTWEST,EAST WEST
437 ALLSTAR,ALL STAR
438 RUNNERUP,RUNNER UP
439 HORSEDRAWN,HORSE DRAWN
440 OPENSOURCE,OPEN SOURCE
441 PURPOSEBUILT,PURPOSE BUILT
442 SQUAREFREE,SQUARE FREE
443 PRESENTDAY,PRESENT DAY
444 CANADAUNITED,CANADA UNITED
445 HOTCHPOTCH,HOTCH POTCH
446 LOWLYING,LOW LYING
447 RIGHTHANDED,RIGHT HANDED
448 PEARSHAPED,PEAR SHAPED
449 BESTKNOWN,BEST KNOWN
450 FULLLENGTH,FULL LENGTH
451 YEARROUND,YEAR ROUND
452 PREELECTION,PRE ELECTION
453 RERECORD,RE RECORD
454 MINIALBUM,MINI ALBUM
455 LONGESTRUNNING,LONGEST RUNNING
456 ALLIRELAND,ALL IRELAND
457 NORTHWESTERN,NORTH WESTERN
458 PARTTIME,PART TIME
459 NONGOVERNMENTAL,NON GOVERNMENTAL
460 ONLINE,ON LINE
461 ONAIR,ON AIR
462 NORTHSOUTH,NORTH SOUTH
463 RERELEASED,RE RELEASED
464 LEFTHANDED,LEFT HANDED
465 BSIDES,B SIDES
466 ANGLOSAXON,ANGLO SAXON
467 SOUTHSOUTHEAST,SOUTH SOUTHEAST
468 CROSSCOUNTRY,CROSS COUNTRY
469 REBUILT,RE BUILT
470 FREEFORM,FREE FORM
471 SCOOBYDOO,SCOOBY DOO
472 ATLARGE,AT LARGE
473 COUNCILMANAGER,COUNCIL MANAGER
474 LONGRUNNING,LONG RUNNING
475 PREWAR,PRE WAR
476 REELECTED,RE ELECTED
477 HIGHSCHOOL,HIGH SCHOOL
478 RUNNERSUP,RUNNERS UP
479 NORTHWEST,NORTH WEST
480 WEBBASED,WEB BASED
481 HIGHQUALITY,HIGH QUALITY
482 RIGHTWING,RIGHT WING
483 LANEFOX,LANE FOX
484 PAYPERVIEW,PAY PER VIEW
485 COPRODUCTION,CO PRODUCTION
486 NONPARTISAN,NON PARTISAN
487 FIRSTPERSON,FIRST PERSON
488 WORLDRENOWNED,WORLD RENOWNED
489 VICEPRESIDENT,VICE PRESIDENT
490 PROROMAN,PRO ROMAN
491 COPRODUCED,CO PRODUCED
492 LOWPOWER,LOW POWER
493 SELFESTEEM,SELF ESTEEM
494 SEMITRANSPARENT,SEMI TRANSPARENT
495 SECONDINCOMMAND,SECOND IN COMMAND
496 HIGHRISE,HIGH RISE
497 COHOSTED,CO HOSTED
498 AFRICANAMERICAN,AFRICAN AMERICAN
499 SOUTHWEST,SOUTH WEST
500 WELLPRESERVED,WELL PRESERVED
501 FEATURELENGTH,FEATURE LENGTH
502 HIPHOP,HIP HOP
503 ALLBIG,ALL BIG
504 SOUTHEAST,SOUTH EAST
505 COUNTERATTACK,COUNTER ATTACK
506 QUARTERFINALS,QUARTER FINALS
507 STABLEDOOR,STABLE DOOR
508 DARKEYED,DARK EYED
509 ALLAMERICAN,ALL AMERICAN
510 THIRDPERSON,THIRD PERSON
511 LOWLEVEL,LOW LEVEL
512 NTERMINAL,N TERMINAL
513 DRIEDUP,DRIED UP
514 AFRICANAMERICANS,AFRICAN AMERICANS
515 ANTIAPARTHEID,ANTI APARTHEID
516 STOKEONTRENT,STOKE ON TRENT
517 NORTHNORTHEAST,NORTH NORTHEAST
518 BRANDNEW,BRAND NEW
519 RIGHTANGLED,RIGHT ANGLED
520 GOVERNMENTOWNED,GOVERNMENT OWNED
521 SONINLAW,SON IN LAW
522 SUBJECTOBJECTVERB,SUBJECT OBJECT VERB
523 LEFTARM,LEFT ARM
524 LONGLIVED,LONG LIVED
525 REDEYE,RED EYE
526 TPOSE,T POSE
527 NIGHTVISION,NIGHT VISION
528 SOUTHEASTERN,SOUTH EASTERN
529 WELLRECEIVED,WELL RECEIVED
530 ALFAYOUM,AL FAYOUM
531 TIMEBASED,TIME BASED
532 KETTLEDRUMS,KETTLE DRUMS
533 BRIGHTEYED,BRIGHT EYED
534 REDBROWN,RED BROWN
535 SAMESEX,SAME SEX
536 PORTDEPAIX,PORT DE PAIX
537 CLEANUP,CLEAN UP
538 PERCENT,PERCENT SIGN
539 TAKEOUT,TAKE OUT
540 KNOWHOW,KNOW HOW
541 FISHBONE,FISH BONE
542 FISHSTICKS,FISH STICKS
543 PAPERWORK,PAPER WORK
544 NICKNACKS,NICK NACKS
545 STREETTALKING,STREET TALKING
546 NONACADEMIC,NON ACADEMIC
547 SHELLY,SHELLEY
548 SHELLY'S,SHELLEY'S
549 JIMMY,JIMMIE
550 JIMMY'S,JIMMIE'S
551 DRUGSTORE,DRUG STORE
552 THRU,THROUGH
553 PLAYDATE,PLAY DATE
554 MICROLIFE,MICRO LIFE
555 SKILLSET,SKILL SET
556 SKILLSETS,SKILL SETS
557 TRADEOFF,TRADE OFF
558 TRADEOFFS,TRADE OFFS
559 ONSCREEN,ON SCREEN
560 PLAYBACK,PLAY BACK
561 ARTWORK,ART WORK
562 COWORKER,CO WORKER
563 COWORKERS,CO WORKERS
564 SOMETIME,SOME TIME
565 SOMETIMES,SOME TIMES
566 CROWDFUNDING,CROWD FUNDING
567 AM,A.M.,A M
568 PM,P.M.,P M
569 TV,T V
570 MBA,M B A
571 USA,U S A
572 US,U S
573 UK,U K
574 CEO,C E O
575 CFO,C F O
576 COO,C O O
577 CIO,C I O
578 FM,F M
579 GMC,G M C
580 FSC,F S C
581 NPD,N P D
582 APM,A P M
583 NGO,N G O
584 TD,T D
585 LOL,L O L
586 IPO,I P O
587 CNBC,C N B C
588 IPOS,I P OS
589 CNBC'S,C N B C'S
590 JT,J T
591 NPR,N P R
592 NPR'S,N P R'S
593 MP,M P
594 IOI,I O I
595 DW,D W
596 CNN,C N N
597 WSM,W S M
598 ET,E T
599 IT,I T
600 RJ,R J
601 DVD,D V D
602 DVD'S,D V D'S
603 HBO,H B O
604 LA,L A
605 XC,X C
606 SUV,S U V
607 NBA,N B A
608 NBA'S,N B A'S
609 ESPN,E S P N
610 ESPN'S,E S P N'S
611 ADT,A D T
612 HD,H D
613 VIP,V I P
614 TMZ,T M Z
615 CBC,C B C
616 NPO,N P O
617 BBC,B B C
618 LA'S,L A'S
619 TMZ'S,T M Z'S
620 HIV,H I V
621 FTC,F T C
622 EU,E U
623 PHD,P H D
624 AI,A I
625 FHI,F H I
626 ICML,I C M L
627 ICLR,I C L R
628 BMW,B M W
629 EV,E V
630 CR,C R
631 API,A P I
632 ICO,I C O
633 LTE,L T E
634 OBS,O B S
635 PC,P C
636 IO,I O
637 CRM,C R M
638 RTMP,R T M P
639 ASMR,A S M R
640 GG,G G
641 WWW,W W W
642 PEI,P E I
643 JJ,J J
644 PT,P T
645 DJ,D J
646 SD,S D
647 POW,P.O.W.,P O W
648 FYI,F Y I
649 DC,D C,D.C
650 ABC,A B C
651 TJ,T J
652 WMDT,W M D T
653 WDTN,W D T N
654 TY,T Y
655 EJ,E J
656 CJ,C J
657 ACL,A C L
658 UK'S,U K'S
659 GTV,G T V
660 MDMA,M D M A
661 DFW,D F W
662 WTF,W T F
663 AJ,A J
664 MD,M D
665 PH,P H
666 ID,I D
667 SEO,S E O
668 UTM'S,U T M'S
669 EC,E C
670 UFC,U F C
671 RV,R V
672 UTM,U T M
673 CSV,C S V
674 SMS,S M S
675 GRB,G R B
676 GT,G T
677 LEM,L E M
678 XR,X R
679 EDU,E D U
680 NBC,N B C
681 EMS,E M S
682 CDC,C D C
683 MLK,M L K
684 IE,I E
685 OC,O C
686 HR,H R
687 MA,M A
688 DEE,D E E
689 AP,A P
690 UFO,U F O
691 DE,D E
692 LGBTQ,L G B T Q
693 PTA,P T A
694 NHS,N H S
695 CMA,C M A
696 MGM,M G M
697 AKA,A K A
698 HW,H W
699 GOP,G O P
700 GOP'S,G O P'S
701 FBI,F B I
702 PRX,P R X
703 CTO,C T O
704 URL,U R L
705 EIN,E I N
706 MLS,M L S
707 CSI,C S I
708 AOC,A O C
709 CND,C N D
710 CP,C P
711 PP,P P
712 CLI,C L I
713 PB,P B
714 FDA,F D A
715 MRNA,M R N A
716 PR,P R
717 VP,V P
718 DNC,D N C
719 MSNBC,M S N B C
720 GQ,G Q
721 UT,U T
722 XXI,X X I
723 HRV,H R V
724 WHO,W H O
725 CRO,C R O
726 DPA,D P A
727 PPE,P P E
728 EVA,E V A
729 BP,B P
730 GPS,G P S
731 AR,A R
732 PJ,P J
733 MLM,M L M
734 OLED,O L E D
735 BO,B O
736 VE,V E
737 UN,U N
738 SLS,S L S
739 DM,D M
740 DM'S,D M'S
741 ASAP,A S A P
742 ETA,E T A
743 DOB,D O B
744 BMW,B M W

View File

@@ -0,0 +1,20 @@
ach
ah
eee
eh
er
ew
ha
hee
hm
hmm
hmmm
huh
mm
mmm
oof
uh
uhh
um
oh
hum
1 ach
2 ah
3 eee
4 eh
5 er
6 ew
7 ha
8 hee
9 hm
10 hmm
11 hmmm
12 huh
13 mm
14 mmm
15 oof
16 uh
17 uhh
18 um
19 oh
20 hum

View File

@@ -0,0 +1 @@
nemo_version from commit:eae1684f7f33c2a18de9ecfa42ec7db93d39e631

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,10 @@
# Text Normalization
Text Normalization is part of NeMo's `nemo_text_processing` - a Python package that is installed with the `nemo_toolkit`.
It converts text from written form into its verbalized form, e.g. "123" -> "one hundred twenty three".
See [NeMo documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details.
Tutorial with overview of the package capabilities: [Text_(Inverse)_Normalization.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb)
Tutorial on how to customize the underlying grammars: [WFST_Tutorial.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,350 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
import string
from collections import defaultdict, namedtuple
from typing import Dict, List, Optional, Set, Tuple
from unicodedata import category
# Token-type tags used by the Kaggle Google text-normalization data format.
EOS_TYPE = "EOS"  # sentence-boundary marker ("<eos>" lines)
PUNCT_TYPE = "PUNCT"  # punctuation tokens (dropped when loading)
PLAIN_TYPE = "PLAIN"  # trivial tokens whose spoken form equals the written form
# One token of training data: its semiotic class, written form, and spoken form.
Instance = namedtuple('Instance', 'token_type un_normalized normalized')
# Semiotic classes recognized by the normalization pipeline.
known_types = [
    "PLAIN",
    "DATE",
    "CARDINAL",
    "LETTERS",
    "VERBATIM",
    "MEASURE",
    "DECIMAL",
    "ORDINAL",
    "DIGIT",
    "MONEY",
    "TELEPHONE",
    "ELECTRONIC",
    "FRACTION",
    "TIME",
    "ADDRESS",
]
def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]:
    """
    Parse one file in the Kaggle Google text-normalization format.

    https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
    Each line is tab-separated: semiotic class, written form, then the spoken
    form (or the literal `self` for trivial classes). A literal "<eos>" line
    marks a sentence boundary.
    E.g.
    PLAIN Brillantaisia <self>
    PLAIN is <self>
    PLAIN a <self>
    PLAIN genus <self>
    PLAIN of <self>
    PLAIN plant <self>
    PLAIN in <self>
    PLAIN family <self>
    PLAIN Acanthaceae <self>
    PUNCT . sil
    <eos> <eos>

    Args:
        file_path: path of the text file to parse
    Returns: flat list of instances (PUNCT tokens are dropped)
    """
    instances = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if fields[0] == "<eos>":
                instances.append(Instance(token_type=EOS_TYPE, un_normalized="", normalized=""))
                continue
            token_type, written, spoken = fields
            written = written.lower()
            spoken = spoken.lower()
            if token_type == PLAIN_TYPE:
                # trivial class: the spoken form is the written form itself
                instances.append(Instance(token_type=token_type, un_normalized=written, normalized=written))
            elif token_type != PUNCT_TYPE:
                instances.append(Instance(token_type=token_type, un_normalized=written, normalized=spoken))
    return instances
def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]:
    """
    Load and concatenate several data files with the given loader.

    Args:
        file_paths: list of file paths to load
        load_func: callable mapping file_path -> list of instances
    Returns: flat list of instances from all files, in input order
    """
    instances: List[Instance] = []
    for path in file_paths:
        instances += load_func(file_path=path)
    return instances
def clean_generic(text: str) -> str:
    """
    Lightweight cleanup that leaves semiotic classes intact.

    Strips surrounding whitespace and lowercases the string.

    Args:
        text: string
    Returns: cleaned string
    """
    return text.strip().lower()
def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = None, verbose: bool = True) -> float:
    """
    Evaluates accuracy given predictions and labels.

    Args:
        preds: predictions
        labels: golden labels (expected to have the same length as `preds`)
        input: optional raw inputs, only used for error reporting
        verbose: if true prints [input], golden labels and predictions
    Returns accuracy in [0, 1]; 0.0 when `preds` is empty
    """
    nums = len(preds)
    if nums == 0:
        # guard against ZeroDivisionError on empty input
        return 0.0
    acc = 0
    for i in range(nums):
        pred_norm = clean_generic(preds[i])
        label_norm = clean_generic(labels[i])
        if pred_norm == label_norm:
            acc = acc + 1
        else:
            if input:
                # "inpu" is intentionally 4 chars to align with "gold"/"pred"
                print(f"inpu: {json.dumps(input[i])}")
            print(f"gold: {json.dumps(label_norm)}")
            print(f"pred: {json.dumps(pred_norm)}")
    return acc / nums
def training_data_to_tokens(
    data: List[Instance], category: Optional[str] = None
) -> Dict[str, Tuple[List[str], List[str]]]:
    """
    Group instances by token type, optionally filtered to one semiotic class.

    Args:
        data: list of instances
        category: optional semiotic class category name; when given, only
            instances of that class are kept
    Returns Dict: token type -> (list of un_normalized strings, list of normalized strings)
    """
    grouped = defaultdict(lambda: ([], []))
    for inst in data:
        if inst.token_type == EOS_TYPE:
            continue  # sentence boundaries carry no text
        if category is not None and inst.token_type != category:
            continue
        raw_list, norm_list = grouped[inst.token_type]
        raw_list.append(inst.un_normalized)
        norm_list.append(inst.normalized)
    return grouped
def training_data_to_sentences(data: List[Instance]) -> Tuple[List[str], List[str], List[Set[str]]]:
    """
    Split an instance stream into sentences at EOS markers.

    NOTE(review): any tokens after the last EOS marker are dropped; this
    matches the original behavior.

    Args:
        data: list of instances
    Returns (list of unnormalized sentences, list of normalized sentences, list of sets of categories in a sentence)
    """
    sentences = []
    categories = []
    current = []
    current_types = set()
    for inst in data:
        if inst.token_type == EOS_TYPE:
            # close out the current sentence and start a fresh one
            sentences.append(current)
            categories.append(current_types)
            current = []
            current_types = set()
        else:
            current.append(inst)
            current_types.add(inst.token_type)
    un_normalized = [" ".join(inst.un_normalized for inst in sent) for sent in sentences]
    normalized = [" ".join(inst.normalized for inst in sent) for sent in sentences]
    return un_normalized, normalized, categories
def post_process_punctuation(text: str) -> str:
    """
    Normalize quotes and spaces.

    Removes spaces just inside brackets, maps various unicode quote
    characters to plain ASCII quotes, and removes the space before
    sentence punctuation.

    Args:
        text: text
    Returns: text with normalized spaces and quotes
    """
    # NOTE(review): several replace() source arguments below are unicode quote
    # variants that may render as empty/odd characters in some viewers —
    # verify against the original file encoding before editing this chain.
    text = (
        text.replace('( ', '(')
        .replace(' )', ')')
        .replace('{ ', '{')
        .replace(' }', '}')
        .replace('[ ', '[')
        .replace(' ]', ']')
        .replace(' ', ' ')
        .replace('', '"')
        .replace("", "'")
        .replace("»", '"')
        .replace("«", '"')
        .replace("\\", "")
        .replace("", '"')
        .replace("´", "'")
        .replace("", "'")
        .replace('', '"')
        .replace("", "'")
        .replace('`', "'")
        .replace('- -', "--")
    )
    # drop the space that detokenization leaves before sentence punctuation
    for punct in "!,.:;?":
        text = text.replace(f' {punct}', punct)
    return text.strip()
def pre_process(text: str) -> str:
    """
    Optional text preprocessing before normalization (part of TTS TN pipeline).

    Surrounds each of '[' and ']' with spaces, then collapses runs of
    spaces into a single space.

    Args:
        text: string that may include semiotic classes
    Returns: text with spaces around the bracket characters
    """
    for bracket in '[]':
        text = text.replace(bracket, f' {bracket} ')
    # collapse any resulting runs of spaces
    return re.sub(r' +', ' ', text)
def load_file(file_path: str) -> List[str]:
    """
    Read a text file into a list of lines.

    Args:
        file_path: file path
    Returns: list of lines, each still carrying its trailing newline (if present)
    """
    with open(file_path, 'r') as handle:
        return list(handle)
def write_file(file_path: str, data: List[str]):
    """
    Write out a list of strings to a file, one per line.

    Args:
        file_path: destination path (overwritten if it exists)
        data: list of strings; a newline is appended to each
    """
    with open(file_path, 'w') as handle:
        handle.writelines(f"{line}\n" for line in data)
def post_process_punct(input: str, normalized_text: str, add_unicode_punct: bool = False):
    """
    Match spaces around punctuation in the normalized output to the input.

    After NN normalization, Moses detokenization puts a space after punctuation
    marks and attaches an opening quote to the word on its right, e.g. input
    "12 test' example" becomes "twelve test 'example" after normalization and
    detokenization; that mismatch can cause issues during TTS voice generation.
    This function shifts spaces in the normalized text back to where the input
    had them: "twelve test 'example" -> "twelve test' example".

    Args:
        input: input text (original input to the NN, before normalization or tokenization)
        normalized_text: output text (output of the TN NN model)
        add_unicode_punct: set to True to also handle unicode punctuation marks in
            addition to string.punctuation (increases post-processing time)
    """
    # In the post-processing WFST graph "``" is replaced with '"' (otherwise single
    # quotes "`" won't be handled correctly); mirror that replacement here so the
    # two strings stay comparable.
    if "``" in input and "``" not in normalized_text:
        input = input.replace("``", '"')
    input = list(input)
    normalized_text = list(normalized_text)
    punct_marks = [x for x in string.punctuation if x in input]
    if add_unicode_punct:
        punct_unicode = [
            chr(i)
            for i in range(sys.maxunicode)
            # Bug fix: the original referenced an undefined name `punct_default`
            # (NameError); the intent is "unicode punctuation not already collected".
            if category(chr(i)).startswith("P") and chr(i) not in punct_marks and chr(i) in input
        ]
        # Bug fix: the original did `punct_marks = punct_marks.extend(...)`, which
        # rebinds punct_marks to None because list.extend returns None.
        punct_marks.extend(punct_unicode)

    def _is_valid(idx_out, idx_in, normalized_text, input):
        """Check if the previous or next character matches between the two texts
        (punctuation that is part of a semiotic token may be missing from the
        normalized text)."""
        return (idx_out > 0 and idx_in > 0 and normalized_text[idx_out - 1] == input[idx_in - 1]) or (
            idx_out < len(normalized_text) - 1
            and idx_in < len(input) - 1
            and normalized_text[idx_out + 1] == input[idx_in + 1]
        )

    for punct in punct_marks:
        try:
            equal = input.count(punct) == normalized_text.count(punct)
            idx_in, idx_out = 0, 0
            while punct in input[idx_in:]:
                idx_out = normalized_text.index(punct, idx_out)
                idx_in = input.index(punct, idx_in)
                if not equal and not _is_valid(idx_out, idx_in, normalized_text, input):
                    idx_in += 1
                    continue
                if idx_in > 0 and idx_out > 0:
                    if normalized_text[idx_out - 1] == " " and input[idx_in - 1] != " ":
                        # drop the spurious space before the mark
                        normalized_text[idx_out - 1] = ""
                    elif normalized_text[idx_out - 1] != " " and input[idx_in - 1] == " ":
                        normalized_text[idx_out - 1] += " "
                if idx_in < len(input) - 1 and idx_out < len(normalized_text) - 1:
                    if normalized_text[idx_out + 1] == " " and input[idx_in + 1] != " ":
                        # drop the spurious space after the mark
                        normalized_text[idx_out + 1] = ""
                    elif normalized_text[idx_out + 1] != " " and input[idx_in + 1] == " ":
                        normalized_text[idx_out] = normalized_text[idx_out] + " "
                idx_out += 1
                idx_in += 1
        except ValueError:
            # counts/positions can legitimately diverge when punctuation was consumed
            # by a semiotic token; .index raises and we move on to the next mark
            # (the original used a bare except with the same intent).
            pass
    normalized_text = "".join(normalized_text)
    return re.sub(r' +', ' ', normalized_text)

View File

@@ -0,0 +1,17 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst

View File

@@ -0,0 +1,342 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser
from typing import List
import regex as re
from nemo_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (list of instances) for text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should only include characters and whitespace characters but no punctuation.
Cardinal unnormalized instances should contain at least one integer and all other characters are removed.
"""
class Filter:
    """
    Pairs a semiotic class with a keep-predicate and a cleanup transform.

    Args:
        class_type: semiotic class used in dataset
        process_func: function to transform text
        filter_func: function to filter text
    """

    def __init__(self, class_type: str, process_func: object, filter_func: object):
        self.class_type = class_type
        self.process_func = process_func
        self.filter_func = filter_func

    def filter(self, instance: Instance) -> bool:
        """Return True when the instance belongs to another class, otherwise the
        result of the configured filter function."""
        return instance.token_type != self.class_type or self.filter_func(instance)

    def process(self, instance: Instance) -> Instance:
        """Return the transformed instance for this class; instances of other
        classes pass through untouched."""
        return self.process_func(instance) if instance.token_type == self.class_type else instance
def filter_cardinal_1(instance: Instance) -> bool:
    """Keep CARDINAL instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_cardinal_1(instance: Instance) -> Instance:
    """Keep only digits on the raw side and lowercase letters/spaces on the spoken side."""
    raw = re.sub(r"[^0-9]", "", instance.un_normalized)
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_ordinal_1(instance: Instance) -> bool:
    """Keep ORDINAL instances whose raw form ends with an ordinal suffix."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) is not None
def process_ordinal_1(instance: Instance) -> Instance:
    """Strip commas/whitespace from the raw side and non-letters from the spoken side."""
    raw = re.sub(r"[,\s]", "", instance.un_normalized)
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_decimal_1(instance: Instance) -> bool:
    """Keep DECIMAL instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_decimal_1(instance: Instance) -> Instance:
    """Drop thousands commas from the raw side; keep letters/spaces on the spoken side."""
    raw = instance.un_normalized.replace(",", "")
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_measure_1(instance: Instance) -> bool:
    """Accept every MEASURE instance."""
    return True
def process_measure_1(instance: Instance) -> Instance:
    """Clean MEASURE pairs: drop commas and 'm2' from the raw side, insert a space
    at digit/unit boundaries, and singularize 'per <unit>s' on the spoken side."""
    raw = instance.un_normalized.replace(",", "").replace("m2", "")
    raw = re.sub(r"(\d)([^\d.\s])", r"\1 \2", raw)
    spoken = re.sub(r"[^a-z\s]", "", instance.normalized)
    spoken = re.sub(r"per ([a-z\s]*)s$", r"per \1", spoken)
    spoken = re.sub(r"[^a-z ]", "", spoken)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_money_1(instance: Instance) -> bool:
    """Keep MONEY instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_money_1(instance: Instance) -> Instance:
    """Clean MONEY pairs: drop commas, map a$/us$ to $, expand trailing m/bn suffixes."""
    raw = instance.un_normalized.replace(",", "")
    raw = re.sub(r"a\$", r"$", raw)
    raw = re.sub(r"us\$", r"$", raw)
    raw = re.sub(r"(\d)m\s*$", r"\1 million", raw)
    raw = re.sub(r"(\d)bn?\s*$", r"\1 billion", raw)
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_time_1(instance: Instance) -> bool:
    """Keep TIME instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_time_1(instance: Instance) -> Instance:
    """Clean TIME pairs: tighten ': ' to ':' and canonicalize am/pm to 'a.m.'/'p.m.'."""
    raw = instance.un_normalized.replace(": ", ":")
    raw = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", raw)
    raw = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", raw)
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_plain_1(instance: Instance) -> bool:
    """Accept every PLAIN instance."""
    return True
def process_plain_1(instance: Instance) -> Instance:
    """PLAIN tokens need no cleanup; rebuild the instance unchanged for uniformity."""
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )
def filter_punct_1(instance: Instance) -> bool:
    """Accept every PUNCT instance."""
    return True
def process_punct_1(instance: Instance) -> Instance:
    """PUNCT tokens need no cleanup; rebuild the instance unchanged for uniformity."""
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )
def filter_date_1(instance: Instance) -> bool:
    """Accept every DATE instance."""
    return True
def process_date_1(instance: Instance) -> Instance:
    """Drop commas from the raw side; keep letters/spaces on the spoken side."""
    raw = instance.un_normalized.replace(",", "")
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=raw, normalized=spoken)
def filter_letters_1(instance: Instance) -> bool:
    """Accept every LETTERS instance."""
    return True
def process_letters_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
def filter_verbatim_1(instance: Instance) -> bool:
    """Accept every VERBATIM instance."""
    return True
def process_verbatim_1(instance: Instance) -> Instance:
    """VERBATIM tokens need no cleanup; rebuild the instance unchanged for uniformity."""
    return Instance(
        token_type=instance.token_type,
        un_normalized=instance.un_normalized,
        normalized=instance.normalized,
    )
def filter_digit_1(instance: Instance) -> bool:
    """Keep DIGIT instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_digit_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
def filter_telephone_1(instance: Instance) -> bool:
    """Keep TELEPHONE instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_telephone_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
def filter_electronic_1(instance: Instance) -> bool:
    """Keep ELECTRONIC instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_electronic_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
def filter_fraction_1(instance: Instance) -> bool:
    """Keep FRACTION instances whose raw form contains at least one digit."""
    # Fix: return a real bool as annotated (re.search yields Match | None).
    return re.search(r"[0-9]", instance.un_normalized) is not None
def process_fraction_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
def filter_address_1(instance: Instance) -> bool:
    """Accept every ADDRESS instance."""
    return True
def process_address_1(instance: Instance) -> Instance:
    """Keep the raw side as-is; strip non-letters from the spoken side."""
    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=spoken)
# One (predicate, transform) pair per semiotic class; EOS sentinels pass through untouched.
filters = [
    Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1),
    Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1),
    Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1),
    Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1),
    Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1),
    Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1),
    Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1),
    Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1),
    Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1),
    Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1),
    Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1),
    Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1),
    Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1),
    Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1),
    Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1),
    Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1),
    Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True),
]
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
    """
    Filter and clean a list of instances.

    Args:
        data: list of instances
        verbose: when True, print every kept (processed) instance

    Returns: filtered and transformed list of instances
    """
    kept = []
    for instance in data:
        matched = False
        for fil in filters:
            if fil.class_type == instance.token_type and fil.filter(instance):
                instance = fil.process(instance)
                matched = True
        if not matched:
            # instance failed its class filter (or has no filter): drop it
            continue
        if verbose:
            print(instance)
        kept.append(instance)
    return kept
def parse_args():
    """Build and parse the command-line arguments for this evaluation script."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100'
    )
    arg_parser.add_argument("--verbose", help="print filtered instances", action='store_true')
    return arg_parser.parse_args()
if __name__ == "__main__":
    # CLI entry point: load Google TN training data, apply the per-class
    # filters/cleaners defined above, and regroup the result into sentences.
    args = parse_args()
    file_path = args.input
    print("Loading training data: " + file_path)
    instance_list = load_files([file_path])  # List of instances
    filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
    # NOTE(review): the (un_normalized, normalized, categories) return value is
    # discarded here; the call appears to serve as a smoke test of the pipeline.
    training_data_to_sentences(filtered_instance_list)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,14 @@
st Street
street Street
expy Expressway
fwy Freeway
hwy Highway
dr Drive
ct Court
ave Avenue
av Avenue
cir Circle
blvd Boulevard
alley Alley
way Way
jct Junction
1 st Street
2 street Street
3 expy Expressway
4 fwy Freeway
5 hwy Highway
6 dr Drive
7 ct Court
8 ave Avenue
9 av Avenue
10 cir Circle
11 blvd Boulevard
12 alley Alley
13 way Way
14 jct Junction

View File

@@ -0,0 +1,52 @@
Alabama AL
Alaska AK
Arizona AZ
Arkansas AR
California CA
Colorado CO
Connecticut CT
Delaware DE
Florida FL
Georgia GA
Hawaii HI
Idaho ID
Illinois IL
Indiana IN
Indiana IND
Iowa IA
Kansas KS
Kentucky KY
Louisiana LA
Maine ME
Maryland MD
Massachusetts MA
Michigan MI
Minnesota MN
Mississippi MS
Missouri MO
Montana MT
Nebraska NE
Nevada NV
New Hampshire NH
New Jersey NJ
New Mexico NM
New York NY
North Carolina NC
North Dakota ND
Ohio OH
Oklahoma OK
Oregon OR
Pennsylvania PA
Rhode Island RI
South Carolina SC
South Dakota SD
Tennessee TN
Tennessee TENN
Texas TX
Utah UT
Vermont VT
Virginia VA
Washington WA
West Virginia WV
Wisconsin WI
Wyoming WY
1 Alabama AL
2 Alaska AK
3 Arizona AZ
4 Arkansas AR
5 California CA
6 Colorado CO
7 Connecticut CT
8 Delaware DE
9 Florida FL
10 Georgia GA
11 Hawaii HI
12 Idaho ID
13 Illinois IL
14 Indiana IN
15 Indiana IND
16 Iowa IA
17 Kansas KS
18 Kentucky KY
19 Louisiana LA
20 Maine ME
21 Maryland MD
22 Massachusetts MA
23 Michigan MI
24 Minnesota MN
25 Mississippi MS
26 Missouri MO
27 Montana MT
28 Nebraska NE
29 Nevada NV
30 New Hampshire NH
31 New Jersey NJ
32 New Mexico NM
33 New York NY
34 North Carolina NC
35 North Dakota ND
36 Ohio OH
37 Oklahoma OK
38 Oregon OR
39 Pennsylvania PA
40 Rhode Island RI
41 South Carolina SC
42 South Dakota SD
43 Tennessee TN
44 Tennessee TENN
45 Texas TX
46 Utah UT
47 Vermont VT
48 Virginia VA
49 Washington WA
50 West Virginia WV
51 Wisconsin WI
52 Wyoming WY

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,31 @@
one
two
three
four
five
six
seven
eight
nine
ten
eleven
twelve
thirteen
fourteen
fifteen
sixteen
seventeen
eighteen
nineteen
twenty
twenty one
twenty two
twenty three
twenty four
twenty five
twenty six
twenty seven
twenty eight
twenty nine
thirty
thirty one
1 one
2 two
3 three
4 four
5 five
6 six
7 seven
8 eight
9 nine
10 ten
11 eleven
12 twelve
13 thirteen
14 fourteen
15 fifteen
16 sixteen
17 seventeen
18 eighteen
19 nineteen
20 twenty
21 twenty one
22 twenty two
23 twenty three
24 twenty four
25 twenty five
26 twenty six
27 twenty seven
28 twenty eight
29 twenty nine
30 thirty
31 thirty one

View File

@@ -0,0 +1,12 @@
jan january
feb february
mar march
apr april
jun june
jul july
aug august
sep september
sept september
oct october
nov november
dec december
1 jan january
2 feb february
3 mar march
4 apr april
5 jun june
6 jul july
7 aug august
8 sep september
9 sept september
10 oct october
11 nov november
12 dec december

View File

@@ -0,0 +1,12 @@
january
february
march
april
may
june
july
august
september
october
november
december
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december

View File

@@ -0,0 +1,24 @@
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december
01 january
02 february
03 march
04 april
05 may
06 june
07 july
08 august
09 september
10 october
11 november
12 december
1 1 january
2 2 february
3 3 march
4 4 april
5 5 may
6 6 june
7 7 july
8 8 august
9 9 september
10 10 october
11 11 november
12 12 december
13 01 january
14 02 february
15 03 march
16 04 april
17 05 may
18 06 june
19 07 july
20 08 august
21 09 september
22 10 october
23 11 november
24 12 december

View File

@@ -0,0 +1,16 @@
A. D AD
A.D AD
a. d AD
a.d AD
a. d. AD
a.d. AD
B. C BC
B.C BC
b. c BC
b.c BC
A. D. AD
A.D. AD
B. C. BC
B.C. BC
b. c. BC
b.c. BC
1 A. D AD
2 A.D AD
3 a. d AD
4 a.d AD
5 a. d. AD
6 a.d. AD
7 B. C BC
8 B.C BC
9 b. c BC
10 b.c BC
11 A. D. AD
12 A.D. AD
13 B. C. BC
14 B.C. BC
15 b. c. BC
16 b.c. BC

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,12 @@
.com dot com
.org dot org
.gov dot gov
.uk dot UK
.fr dot FR
.net dot net
.br dot BR
.in dot IN
.ru dot RU
.de dot DE
.it dot IT
.jpg dot jpeg
1 .com dot com
2 .org dot org
3 .gov dot gov
4 .uk dot UK
5 .fr dot FR
6 .net dot net
7 .br dot BR
8 .in dot IN
9 .ru dot RU
10 .de dot DE
11 .it dot IT
12 .jpg dot jpeg

View File

@@ -0,0 +1,21 @@
. dot
- dash
_ underscore
! exclamation mark
# number sign
$ dollar sign
% percent sign
& ampersand
' quote
* asterisk
+ plus
/ slash
= equal sign
? question mark
^ circumflex
` right single quote
{ left brace
| vertical bar
} right brace
~ tilde
, comma
1 . dot
2 - dash
3 _ underscore
4 ! exclamation mark
5 # number sign
6 $ dollar sign
7 % percent sign
8 & ampersand
9 ' quote
10 * asterisk
11 + plus
12 / slash
13 = equal sign
14 ? question mark
15 ^ circumflex
16 ` right single quote
17 { left brace
18 | vertical bar
19 } right brace
20 ~ tilde
21 , comma

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,8 @@
+ plus
- minus
/ divided
÷ divided
: divided
× times
* times
· times
1 + plus
2 - minus
3 / divided
4 ÷ divided
5 : divided
6 × times
7 * times
8 · times

View File

@@ -0,0 +1,127 @@
amu atomic mass unit
bar bar
° degree
º degree
°c degree Celsius
°C degree Celsius
ºc degree Celsius
ºC degree Celsius
℃ degree Celsius
cm2 square centimeter
cm² square centimeter
cm3 cubic centimeter
cm³ cubic centimeter
cm centimeter
cwt hundredweight
db decibel
dm3 cubic decimeter
dm³ cubic decimeter
dm decimeter
ds decisecond
°f degree Fahrenheit
°F degree Fahrenheit
℉ degree Fahrenheit
ft foot
ghz gigahertz
gw gigawatt
gwh gigawatt hour
hz hertz
" inch
kbps kilobit per second
kcal kilo calory
kgf kilogram force
kg kilogram
khz kilohertz
km2 square kilometer
km² square kilometer
km3 cubic kilometer
km³ cubic kilometer
km kilometer
kpa kilopascal
kwh kilowatt hour
kw kilowatt
kW kilowatt
lb pound
lbs pound
m2 square meter
m² square meter
m3 cubic meter
m³ cubic meter
mbps megabit per second
mg milligram
mhz megahertz
mi2 square mile
mi² square mile
mi3 cubic mile
mi³ cubic mile
cu mi cubic mile
mi mile
min minute
ml milliliter
mm2 square millimeter
mm² square millimeter
mol mole
mpa megapascal
mph mile per hour
ng nanogram
nm nanometer
ns nanosecond
oz ounce
pa pascal
% percent
rad radian
rpm revolution per minute
sq ft square foot
sq mi square mile
sv sievert
tb terabyte
tj terajoule
tl teraliter
v volt
yd yard
μg microgram
μm micrometer
μs microsecond
ω ohm
atm ATM
au AU
bq BQ
cc CC
cd CD
da DA
eb EB
ev EV
f F
gb GB
g G
gl GL
gpa GPA
gy GY
ha HA
h H
hl HL
hp GP
hs HS
kb KB
kl KL
kn KN
kt KT
kv KV
lm LM
ma MA
mA MA
mb MB
mc MC
mf MF
m M
mm MM
ms MS
mv MV
mw MW
pb PB
pg PG
ps PS
s S
tb TB
tb YB
zb ZB
Can't render this file because it contains an unexpected character in line 127 and column 6.

View File

@@ -0,0 +1,43 @@
atm atmosphere
bq becquerel
cd candela
da dalton
eb exabyte
f degree Fahrenheit
gb gigabyte
g gram
gl gigaliter
ha hectare
h hour
hl hectoliter
hp horsepower
hp horsepower
kb kilobit
kb kilobyte
ma megaampere
mA megaampere
ma milliampere
mA milliampere
mb megabyte
mc megacoulomb
mf megafarad
m meter
m minute
mm millimeter
mm millimeter
mm millimeter
ms megasecond
ms mega siemens
ms millisecond
mv millivolt
mV millivolt
mw megawatt
mW megawatt
pb petabyte
pg petagram
ps petasecond
s second
tb terabyte
tb terabyte
yb yottabyte
zb zettabyte
1 atm atmosphere
2 bq becquerel
3 cd candela
4 da dalton
5 eb exabyte
6 f degree Fahrenheit
7 gb gigabyte
8 g gram
9 gl gigaliter
10 ha hectare
11 h hour
12 hl hectoliter
13 hp horsepower
14 hp horsepower
15 kb kilobit
16 kb kilobyte
17 ma megaampere
18 mA megaampere
19 ma milliampere
20 mA milliampere
21 mb megabyte
22 mc megacoulomb
23 mf megafarad
24 m meter
25 m minute
26 mm millimeter
27 mm millimeter
28 mm millimeter
29 ms megasecond
30 ms mega siemens
31 ms millisecond
32 mv millivolt
33 mV millivolt
34 mw megawatt
35 mW megawatt
36 pb petabyte
37 pg petagram
38 ps petasecond
39 s second
40 tb terabyte
41 tb terabyte
42 yb yottabyte
43 zb zettabyte

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,39 @@
$ dollar
$ us dollar
US$ us dollar
฿ Thai Baht
£ pound
€ euro
₩ won
nzd new zealand dollar
rs rupee
chf swiss franc
dkk danish kroner
fim finnish markka
aed arab emirates dirham
¥ yen
czk czech koruna
mro mauritanian ouguiya
pkr pakistani rupee
crc costa rican colon
hk$ hong kong dollar
npr nepalese rupee
awg aruban florin
nok norwegian kroner
tzs tanzanian shilling
sek swedish kronor
cyp cypriot pound
r real
sar saudi riyal
cve cape verde escudo
rsd serbian dinar
dm german mark
shp saint helena pounds
php philippine peso
cad canadian dollar
ssp south sudanese pound
scr seychelles rupee
mvr maldivian rufiyaa
DH dirham
Dh dirham
Dhs. dirham
1 $ dollar
2 $ us dollar
3 US$ us dollar
4 ฿ Thai Baht
5 £ pound
6 euro
7 won
8 nzd new zealand dollar
9 rs rupee
10 chf swiss franc
11 dkk danish kroner
12 fim finnish markka
13 aed arab emirates dirham
14 ¥ yen
15 czk czech koruna
16 mro mauritanian ouguiya
17 pkr pakistani rupee
18 crc costa rican colon
19 hk$ hong kong dollar
20 npr nepalese rupee
21 awg aruban florin
22 nok norwegian kroner
23 tzs tanzanian shilling
24 sek swedish kronor
25 cyp cypriot pound
26 r real
27 sar saudi riyal
28 cve cape verde escudo
29 rsd serbian dinar
30 dm german mark
31 shp saint helena pounds
32 php philippine peso
33 cad canadian dollar
34 ssp south sudanese pound
35 scr seychelles rupee
36 mvr maldivian rufiyaa
37 DH dirham
38 Dh dirham
39 Dhs. dirham

View File

@@ -0,0 +1,4 @@
$ cents
US$ cents
€ cents
£ pence
1 $ cents
2 US$ cents
3 cents
4 £ pence

View File

@@ -0,0 +1,3 @@
$ cent
€ cent
£ penny
1 $ cent
2 cent
3 £ penny

View File

@@ -0,0 +1,2 @@
/ea each
/dozen
Can't render this file because it has a wrong number of fields in line 2.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,9 @@
one 1
two 2
three 3
four 4
five 5
six 6
seven 7
eight 8
nine 9
1 one 1
2 two 2
3 three 3
4 four 4
5 five 5
6 six 6
7 seven 7
8 eight 8
9 nine 9

View File

@@ -0,0 +1,18 @@
¼ 1/4
½ 1/2
¾ 3/4
⅐ 1/7
⅑ 1/9
⅒ 1/10
⅓ 1/3
⅔ 2/3
⅕ 1/5
⅖ 2/5
⅗ 3/5
⅘ 4/5
⅙ 1/6
⅚ 5/6
⅛ 1/8
⅜ 3/8
⅝ 5/8
⅞ 7/8
1 ¼ 1/4
2 ½ 1/2
3 ¾ 3/4
4 1/7
5 1/9
6 1/10
7 1/3
8 2/3
9 1/5
10 2/5
11 3/5
12 4/5
13 1/6
14 5/6
15 1/8
16 3/8
17 5/8
18 7/8

View File

@@ -0,0 +1 @@
hundred
1 hundred

View File

@@ -0,0 +1,10 @@
M million
MLN million
m million
mln million
B billion
b billion
BN billion
bn billion
K thousand
k thousand
1 M million
2 MLN million
3 m million
4 mln million
5 B billion
6 b billion
7 BN billion
8 bn billion
9 K thousand
10 k thousand

View File

@@ -0,0 +1,10 @@
ten 10
eleven 11
twelve 12
thirteen 13
fourteen 14
fifteen 15
sixteen 16
seventeen 17
eighteen 18
nineteen 19
1 ten 10
2 eleven 11
3 twelve 12
4 thirteen 13
5 fourteen 14
6 fifteen 15
7 sixteen 16
8 seventeen 17
9 eighteen 18
10 nineteen 19

View File

@@ -0,0 +1,22 @@
thousand
million
billion
trillion
quadrillion
quintillion
sextillion
septillion
octillion
nonillion
decillion
undecillion
duodecillion
tredecillion
quattuordecillion
quindecillion
sexdecillion
septendecillion
octodecillion
novemdecillion
vigintillion
centillion
1 thousand
2 million
3 billion
4 trillion
5 quadrillion
6 quintillion
7 sextillion
8 septillion
9 octillion
10 nonillion
11 decillion
12 undecillion
13 duodecillion
14 tredecillion
15 quattuordecillion
16 quindecillion
17 sexdecillion
18 septendecillion
19 octodecillion
20 novemdecillion
21 vigintillion
22 centillion

View File

@@ -0,0 +1,8 @@
twenty 2
thirty 3
forty 4
fifty 5
sixty 6
seventy 7
eighty 8
ninety 9
1 twenty 2
2 thirty 3
3 forty 4
4 fifty 5
5 sixty 6
6 seventy 7
7 eighty 8
8 ninety 9

View File

@@ -0,0 +1 @@
zero 0
1 zero 0

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,9 @@
first one
second two
third three
fourth four
fifth five
sixth six
seventh seven
eighth eight
ninth nine
1 first one
2 second two
3 third three
4 fourth four
5 fifth five
6 sixth six
7 seventh seven
8 eighth eight
9 ninth nine

View File

@@ -0,0 +1 @@
twelfth twelve
1 twelfth twelve

View File

@@ -0,0 +1,20 @@
`female.tsv` - List of common female names. Copyright (c) January 1991 by Mark Kantrowitz, 4987 names, Version 1.3 (29-MAR-94)
Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt)
`male.tsv` - List of common male names. Copyright (c) January 1991 by Mark Kantrowitz, 2940 names, Version 1.3 (29-MAR-94)
Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt)
[Corpora Readme.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/readme.txt):
You may use the lists of names for any purpose, so long as credit is given
in any published work. You may also redistribute the list if you
provide the recipients with a copy of this README file. The lists are
not in the public domain (I retain the copyright on the lists) but are
freely redistributable.
If you have any additions to the lists of names, I would appreciate
receiving them.
My email address is mkant+@cs.cmu.edu.
Mark Kantrowitz

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
chapter
class
part
article
section
paragraph
1 chapter
2 class
3 part
4 article
5 section
6 paragraph

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,83 @@
deer
fish
sheep
foot feet
goose geese
man men
mouse mice
tooth teeth
woman women
won
child children
ox oxen
wife wives
wolf wolves
analysis analyses
criterion criteria
lbs
focus foci
percent
hertz
kroner krone
inch inches
calory calories
yen
megahertz
gigahertz
kilohertz
hertz
CC
c c
horsepower
hundredweight
kilogram force kilograms force
mega siemens
revolution per minute revolutions per minute
mile per hour miles per hour
megabit per second megabits per second
square foot square feet
kilobit per second kilobits per second
degree Celsius degrees Celsius
degree Fahrenheit degrees Fahrenheit
ATM
AU
BQ
CC
CD
DA
EB
EV
F
GB
G
GL
GPA
GY
HA
H
HL
GP
HS
KB
KL
KN
KT
KV
LM
MA
MA
MB
MC
MF
M
MM
MS
MV
MW
PB
PG
PS
S
TB
YB
ZB
Can't render this file because it has a wrong number of fields in line 4.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,2 @@
IP address is
IP is
1 IP address is
2 IP is

View File

@@ -0,0 +1,4 @@
ssn is SSN is
ssn is SSN is
SSN is
SSN
Can't render this file because it has a wrong number of fields in line 3.

View File

@@ -0,0 +1,5 @@
call me at
reach at
reached at
my number is
hit me up at
1 call me at
2 reach at
3 reached at
4 my number is
5 hit me up at

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,12 @@
p.m. PM
p.m PM
pm PM
P.M. PM
P.M PM
PM PM
a.m. AM
a.m AM
am AM
A.M. AM
A.M AM
AM AM
1 p.m. PM
2 p.m PM
3 pm PM
4 P.M. PM
5 P.M PM
6 PM PM
7 a.m. AM
8 a.m AM
9 am AM
10 A.M. AM
11 A.M AM
12 AM AM

View File

@@ -0,0 +1,14 @@
cst CST
c.s.t CST
cet CET
c.e.t CET
pst PST
p.s.t PST
est EST
e.s.t EST
pt PT
p.t PT
et ET
e.t ET
gmt GMT
g.m.t GMT
1 cst CST
2 c.s.t CST
3 cet CET
4 c.e.t CET
5 pst PST
6 p.s.t PST
7 est EST
8 e.s.t EST
9 pt PT
10 p.t PT
11 et ET
12 e.t ET
13 gmt GMT
14 g.m.t GMT

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,45 @@
Hon. Honorable
Mr. Mister
Mrs. Misses
Ms. Miss
Mr Mister
Mrs Misses
Ms Miss
AC air conditioning
AC air conditioner
AC air conditioners
AC alternating current
&Co. and Co.
&Co. and Company
Mon Monday
Tu Tuesday
Wed Wednesday
Th Thursday
Thur Thursday
Thurs Thursday
Fri Friday
Sat Saturday
Sun Sunday
Mon Mon
Tu Tu
Wed Wed
Th Th
Thur Thur
Thurs Thurs
Fri Fri
Sat Sat
Sun Sun
= equals
# number
No. number
No number
NO number
NO. number
NO nitrogen monoxide
NO NO
NO. NO.
No. No.
No No
VOL Volume
VOL. Volume
TV Television
1 Hon. Honorable
2 Mr. Mister
3 Mrs. Misses
4 Ms. Miss
5 Mr Mister
6 Mrs Misses
7 Ms Miss
8 AC air conditioning
9 AC air conditioner
10 AC air conditioners
11 AC alternating current
12 &Co. and Co.
13 &Co. and Company
14 Mon Monday
15 Tu Tuesday
16 Wed Wednesday
17 Th Thursday
18 Thur Thursday
19 Thurs Thursday
20 Fri Friday
21 Sat Saturday
22 Sun Sunday
23 Mon Mon
24 Tu Tu
25 Wed Wed
26 Th Th
27 Thur Thur
28 Thurs Thurs
29 Fri Fri
30 Sat Sat
31 Sun Sun
32 = equals
33 # number
34 No. number
35 No number
36 NO number
37 NO. number
38 NO nitrogen monoxide
39 NO NO
40 NO. NO.
41 No. No.
42 No No
43 VOL Volume
44 VOL. Volume
45 TV Television

View File

@@ -0,0 +1,14 @@
st street
st saint
dr doctor
dr drive
mt mount
sr senior
prof professor
mt mountain
sr senior
jr junior
vol volume
rd road
ave avenue
approx approximately
1 st street
2 st saint
3 dr doctor
4 dr drive
5 mt mount
6 sr senior
7 prof professor
8 mt mountain
9 sr senior
10 jr junior
11 vol volume
12 rd road
13 ave avenue
14 approx approximately

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,521 @@
a
aoj
aəj
aː
aːʲ
aː͡j
aː͡ɨ̯
ː
ː
a͡e
a͡i
a͡iː
a͡i̯
a͡j
a͡o
a͡u
a͡uː
a͡u̯
a͡w
a͡ə
a͡ɨ̯
ɪ
a͡ʊ
b
bː
c
cː
d
dː
d̪ʱ
d͡z
d͡zʷ
d͡zː
d͡ʑ
d͡ʒ
d͡ʒʱ
d͡ʒʲ
d͡ʒː
e
eː
eːʲ
eː͡j
ː
ẽ͡j̃
ː
e͡i
e͡iː
e͡ɨ̯
f
fː
h
hː
i
iəj
iəw
iː
iːʲ
ː
i͡u
i͡ə
i͡ɛ
j
jː
k
ː
ʼ
ʼ
kʼ
kː
k̚ʲ
k̟̚
k͡p̚
l
lː
m
ː
mː
n
nː
o
oː
oːʲ
õ͡j̃
õ͡w̃
ː
o͡u
o͡uː
p
ː
ʼ
pʼ
pː
p̚ʲ
p͜f
p͡f
q
qʼ
r
rː
ː
s
sʼ
sː
t
ː
ʼ
tʼ
tː
t̪ʰ
t͜s
t͡s
t͡sʰ
t͡sʰː
t͡sʲ
t͡sʷ
t͡sʼ
t͡sː
t͡ɕ
t͡ɕʰ
t͡ɕ͈
t͡ʂ
t͡ʂʼ
t͡ʃ
t͡ʃʰ
t͡ʃʰː
t͡ʃʲ
t͡ʃʷ
t͡ʃʼ
t͡ʃː
u
uəj
uː
uːʲ
ː
ũ͡j̃
u͡e
u͡i
u͡j
u͡ɔ
u͡ə
v
vː
w
x
xː
y
yː
yːʲ
z
zː
à
àː
á
áː
â
âː
ã
ã̠
æ
æː
æ̀
æ̀ː
æ̂
æ̂ː
æ͡ɪ
æ͡ʉ
ç
è
èː
é
éː
ê
êː
ì
ìː
í
íː
î
îː
ï
ð
ò
òː
ó
óː
ô
ôː
õ
õː
õ̞
ø
øː
øːʲ
ø̯
ù
ùː
ú
úː
û
ûː
ā
āː
ē
ēː
ĕ
ĕ͡ə
ě
ěː
ħ
ĩ
ĩː
ī
īː
ŋ
ŋʲ
ŋ̊
ŋ̍
ŋ̟
ŋ̩
ŋ͡m
ō
ŏ
ŏ͡ə
œ
œː
œ̃
œ͡i
œ͡iː
œ͡ʏ
ř
řː
ũ
ũː
ū
ūː
ŭ
ŭ͡ə
ǎ
ǎː
ǐ
ǐː
ǒ
ǒː
ǔ
ǔː
ǣ
ǣː
ɐ
ɐː
ɐ̃
ɐ̃͡j̃
ɐ̃͡w̃
ɐ̯
ɐ̯̯
ɑ
ɑː
ɑ̃
ɑ̃ː
ɒ
ɒʲ
ɒː
ɓ
ɔ
ɔː
ɔˤː
ɔ̀
ɔ̀ː
ɔ́
ɔ́ː
ɔ̃
ɔ̃ː
ɔ̰
ɔ͡i̯
ɔ͡ə
ɔ͡ɨ̯
ɔ͡ɪ
ɔ͡ʊ
ɕ
ɕʰ
ɕː
ɕ͈
ɖ
ɖʱ
ɗ
ɘ
ɘː
ə
əː
əˤ
ə̀
ə́
ə̃
ə̯
ə͡u̯
ə͡w
ə͡ɨ
ə͡ɨ̯
ɚ
ɛ
ɛʲ
ɛː
ɛˤː
ɛ̀
ɛ̀ː
ɛ́
ɛ́ː
ɛ̂
ɛ̂ː
ɛ̃
ɛ̃ː
ɛ̄
ɛ̄ː
ɛ̰
ɛ͡i
ɛ͡i̯
ɛ͡u
ɛ͡u̯
ɛ͡ɪ
ɛ͡ʊ
ɜ
ɜː
ɝ
ɝː
ɟ
ɟː
ɟ͡ʝ
ɡ
ɡʱ
ɡʲ
ɡʷ
ɡː
ɡ̊
ɣ
ɤ
ɥ
ɦ
ɨ
ɨəj
ɨː
ɨ̃ᵝ
ɨ̞
ɨ̥ᵝ
ɨ̯
ɨ͡u̯
ɨ͡w
ɨ͡ə
ɨᵝ
ɨᵝː
ɪ
ɪː
ɪ̀
ɪ́
ɪ̃
ɪ̯
ɪ̰
ɪ͡u̯
ɪ͡ʊ
ɫ
ɫː
ɬ
ɬʼ
ɭ
ɮ
ɯ
ɯː
ɯ̟̃ᵝ
ɯ̟̊ᵝ
ɯ̟ᵝ
ɯ̟ᵝː
ɰ
ɰ̃
ɰᵝ
ɱ
ɱ̩
ɲ
ɲː
ɲ̊
ɲ̟
ɳ
ɴ
ɸ
ɸʷ
ɹ
ɻ
ɽ
ɽʱ
ɾ
ɾʲ
ɾː
ɾ̝̊
ʀ
ʁ
ʁʷ
ʁː
ʂ
ʂʷ
ʃ
ʃʰ
ʃʲ
ʃʷ
ʃʷʼ
ʃʼ
ʃː
ʈ
ʈʰ
ʉ
ʉː
ʊ
ʊ̀
ʊ́
ʊ̃
ʊ̯
ʊ̯͡i
ʊ̯͡ɨ
ʊ̰
ʋ
ʌ
ʌ̹
ʍ
ʎ
ʏ
ʏː
ʏ̯
ʐ
ʐʷ
ʑ
ʒ
ʒʲ
ʒʷ
ʒː
ʔ
ʔʲ
ʔʷ
ʝ
˦ˀ˥
˦˥
˦˧˥
˦˩
˧ˀ˨
˧˦
˧˧
˧˨
˧˩
˨˩
˨˩˦
˨˩˨
β
θ
χ
χʷ
χː
ẽː
ẽ̞
1 a
2 aoj
3 aəj
4
5 aːʲ
6 aː͡j
7 aː͡ɨ̯
8
9 aˤː
10
11 a̠ː
12
13 a͡e
14 a͡i
15 a͡iː
16 a͡i̯
17 a͡j
18 a͡o
19 a͡u
20 a͡uː
21 a͡u̯
22 a͡w
23 a͡ə
24 a͡ɨ̯
25 a͡ɪ
26 a͡ʊ
27 b
28
29
30
31
32 c
33
34
35
36 d
37
38
39
40
41 d̪ʱ
42 d͡z
43 d͡zʷ
44 d͡zː
45 d͡ʑ
46 d͡ʒ
47 d͡ʒʱ
48 d͡ʒʲ
49 d͡ʒː
50 e
51
52 eːʲ
53 eː͡j
54 ẽː
55 ẽ͡j̃
56
57 e̞ː
58
59 e͡i
60 e͡iː
61 e͡ɨ̯
62 f
63
64
65 h
66
67 i
68 iəj
69 iəw
70
71
72 iːʲ
73 ĩː
74
75
76 i͡u
77 i͡ə
78 i͡ɛ
79 j
80
81
82 k
83
84 kʰː
85
86 kʲʼ
87
88 kʷʼ
89
90
91
92 k̚ʲ
93 k̟̚
94
95 k͡p̚
96 l
97
98
99
100
101 m
102
103 mʲː
104
105
106
107 n
108
109
110
111
112 o
113
114
115 oːʲ
116
117 õ͡j̃
118 õ͡w̃
119
120
121 o̞ː
122
123
124 o͡u
125 o͡uː
126 p
127
128 pʰː
129
130 pʷʼ
131
132
133
134 p̚ʲ
135
136 p͜f
137 p͡f
138 q
139
140
141 r
142
143
144
145 r̂ː
146
147
148 s
149
150
151
152
153
154 t
155
156 tʰː
157
158 tʷʼ
159
160
161
162
163 t̪ʰ
164
165 t͜s
166 t͡s
167 t͡sʰ
168 t͡sʰː
169 t͡sʲ
170 t͡sʷ
171 t͡sʼ
172 t͡sː
173 t͡ɕ
174 t͡ɕʰ
175 t͡ɕ͈
176 t͡ʂ
177 t͡ʂʼ
178 t͡ʃ
179 t͡ʃʰ
180 t͡ʃʰː
181 t͡ʃʲ
182 t͡ʃʷ
183 t͡ʃʼ
184 t͡ʃː
185 u
186 uəj
187
188
189 uːʲ
190 ũː
191 ũ͡j̃
192
193 u͡e
194 u͡i
195 u͡j
196 u͡ɔ
197 u͡ə
198 v
199
200
201 w
202
203 x
204
205
206 y
207
208 yːʲ
209
210 z
211
212
213
214 à
215 àː
216 á
217 áː
218 â
219 âː
220 ã
221 ã̠
222 æ
223 æː
224 æ̀
225 æ̀ː
226 æ̂
227 æ̂ː
228 æ͡ɪ
229 æ͡ʉ
230 ç
231 è
232 èː
233 é
234 éː
235 ê
236 êː
237 ì
238 ìː
239 í
240 íː
241 î
242 îː
243 ï
244 ð
245 ò
246 òː
247 ó
248 óː
249 ô
250 ôː
251 õ
252 õː
253 õ̞
254 ø
255 øː
256 øːʲ
257 ø̯
258 ù
259 ùː
260 ú
261 úː
262 û
263 ûː
264 ā
265 āː
266 ē
267 ēː
268 ĕ
269 ĕ͡ə
270 ě
271 ěː
272 ħ
273 ĩ
274 ĩː
275 ī
276 īː
277 ŋ
278 ŋʲ
279 ŋ̊
280 ŋ̍
281 ŋ̟
282 ŋ̩
283 ŋ͡m
284 ō
285 ŏ
286 ŏ͡ə
287 œ
288 œː
289 œ̃
290 œ͡i
291 œ͡iː
292 œ͡ʏ
293 ř
294 řː
295 ũ
296 ũː
297 ū
298 ūː
299 ŭ
300 ŭ͡ə
301 ǎ
302 ǎː
303 ǐ
304 ǐː
305 ǒ
306 ǒː
307 ǔ
308 ǔː
309 ǣ
310 ǣː
311 ɐ
312 ɐː
313 ɐ̃
314 ɐ̃͡j̃
315 ɐ̃͡w̃
316 ɐ̯
317 ɐ̯̯
318 ɑ
319 ɑː
320 ɑ̃
321 ɑ̃ː
322 ɒ
323 ɒʲ
324 ɒː
325 ɓ
326 ɔ
327 ɔː
328 ɔˤː
329 ɔ̀
330 ɔ̀ː
331 ɔ́
332 ɔ́ː
333 ɔ̃
334 ɔ̃ː
335 ɔ̰
336 ɔ͡i̯
337 ɔ͡ə
338 ɔ͡ɨ̯
339 ɔ͡ɪ
340 ɔ͡ʊ
341 ɕ
342 ɕʰ
343 ɕː
344 ɕ͈
345 ɖ
346 ɖʱ
347 ɗ
348 ɘ
349 ɘː
350 ə
351 əː
352 əˤ
353 ə̀
354 ə́
355 ə̃
356 ə̯
357 ə͡u̯
358 ə͡w
359 ə͡ɨ
360 ə͡ɨ̯
361 ɚ
362 ɛ
363 ɛʲ
364 ɛː
365 ɛˤː
366 ɛ̀
367 ɛ̀ː
368 ɛ́
369 ɛ́ː
370 ɛ̂
371 ɛ̂ː
372 ɛ̃
373 ɛ̃ː
374 ɛ̄
375 ɛ̄ː
376 ɛ̰
377 ɛ͡i
378 ɛ͡i̯
379 ɛ͡u
380 ɛ͡u̯
381 ɛ͡ɪ
382 ɛ͡ʊ
383 ɜ
384 ɜː
385 ɝ
386 ɝː
387 ɟ
388 ɟː
389 ɟ͡ʝ
390 ɡ
391 ɡʱ
392 ɡʲ
393 ɡʷ
394 ɡː
395 ɡ̊
396 ɣ
397 ɤ
398 ɥ
399 ɦ
400 ɨ
401 ɨəj
402 ɨː
403 ɨ̃ᵝ
404 ɨ̞
405 ɨ̥ᵝ
406 ɨ̯
407 ɨ͡u̯
408 ɨ͡w
409 ɨ͡ə
410 ɨᵝ
411 ɨᵝː
412 ɪ
413 ɪː
414 ɪ̀
415 ɪ́
416 ɪ̃
417 ɪ̯
418 ɪ̰
419 ɪ͡u̯
420 ɪ͡ʊ
421 ɫ
422 ɫː
423 ɬ
424 ɬʼ
425 ɭ
426 ɮ
427 ɯ
428 ɯː
429 ɯ̟̃ᵝ
430 ɯ̟̊ᵝ
431 ɯ̟ᵝ
432 ɯ̟ᵝː
433 ɰ
434 ɰ̃
435 ɰᵝ
436 ɱ
437 ɱ̩
438 ɲ
439 ɲː
440 ɲ̊
441 ɲ̟
442 ɳ
443 ɴ
444 ɸ
445 ɸʷ
446 ɹ
447 ɻ
448 ɽ
449 ɽʱ
450 ɾ
451 ɾʲ
452 ɾː
453 ɾ̝̊
454 ʀ
455 ʁ
456 ʁʷ
457 ʁː
458 ʂ
459 ʂʷ
460 ʃ
461 ʃʰ
462 ʃʲ
463 ʃʷ
464 ʃʷʼ
465 ʃʼ
466 ʃː
467 ʈ
468 ʈʰ
469 ʉ
470 ʉː
471 ʊ
472 ʊ̀
473 ʊ́
474 ʊ̃
475 ʊ̯
476 ʊ̯͡i
477 ʊ̯͡ɨ
478 ʊ̰
479 ʋ
480 ʌ
481 ʌ̹
482 ʍ
483 ʎ
484 ʏ
485 ʏː
486 ʏ̯
487 ʐ
488 ʐʷ
489 ʑ
490 ʒ
491 ʒʲ
492 ʒʷ
493 ʒː
494 ʔ
495 ʔʲ
496 ʔʷ
497 ʝ
498 ˦ˀ˥
499 ˦˥
500 ˦˧˥
501 ˦˩
502 ˧ˀ˨
503 ˧˦
504 ˧˧
505 ˧˨
506 ˧˩
507 ˨˩
508 ˨˩˦
509 ˨˩˨
510 β
511 θ
512 χ
513 χʷ
514 χː
515
516
517
518
519 ẽː
520 ẽ̞
521

View File

@@ -0,0 +1,21 @@
Mr. mister
Mrs. misses
Dr. doctor
Drs. doctors
Co. company
Lt. lieutenant
Sgt. sergeant
St. saint
Jr. junior
Maj. major
Hon. honorable
Gov. governor
Capt. captain
Esq. esquire
Gen. general
Ltd. limited
Rev. reverend
Col. colonel
Mt. mount
Ft. fort
etc. et cetera
1 Mr. mister
2 Mrs. misses
3 Dr. doctor
4 Drs. doctors
5 Co. company
6 Lt. lieutenant
7 Sgt. sergeant
8 St. saint
9 Jr. junior
10 Maj. major
11 Hon. honorable
12 Gov. governor
13 Capt. captain
14 Esq. esquire
15 Gen. general
16 Ltd. limited
17 Rev. reverend
18 Col. colonel
19 Mt. mount
20 Ft. fort
21 etc. et cetera

View File

@@ -0,0 +1,23 @@
& and
# hash
@ at
§ section
™ trademark
® registered trademark
© copyright
_ underscore
% percent sign
* asterisk
+ plus
/ slash
= equal sign
^ circumflex
| vertical bar
~ tilde
$ dollar
£ pound
€ euro
₩ won
¥ yen
° degree
º degree
1 & and
2 # hash
3 @ at
4 § section
5 trademark
6 ® registered trademark
7 © copyright
8 _ underscore
9 % percent sign
10 * asterisk
11 + plus
12 / slash
13 = equal sign
14 ^ circumflex
15 | vertical bar
16 ~ tilde
17 $ dollar
18 £ pound
19 euro
20 won
21 ¥ yen
22 ° degree
23 º degree

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import string
from pathlib import Path
from typing import Dict
import pynini
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8
# --- Character classes used as building blocks by all grammars in this package ---
NEMO_CHAR = utf8.VALID_UTF8_CHAR  # any single valid UTF-8 character
NEMO_DIGIT = byte.DIGIT  # ASCII digit 0-9
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"  # protects spaces inside quoted token values (see convert_space)
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
NEMO_SIGMA = pynini.closure(NEMO_CHAR)  # sigma-star: any string over the alphabet

# --- Common whitespace-editing transducers ---
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Strips serialization attributes (ordering hints) that taggers may append to tokens.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)

# --- English pluralization: irregular forms from TSV, then -ies / -es / -s rules ---
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")  # consonant + "y" -> "ies"
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")
# Highest priority first: irregular (suppletive) > -ies > -es > -s
graph_plural = plurals._priority_union(
    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
).optimize()
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)

# Single-letter ASCII case-conversion transducers.
TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
TO_UPPER = pynini.invert(TO_LOWER)

# Tiny weights used to bias path selection without changing which strings are accepted.
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
    """
    Writes the given grammars to an OpenFst finite state archive (FAR) file.

    Args:
        file_name: path of the FAR file to create
        graphs: mapping from rule name to the Pynini WFST graph exported under that name
    """
    far_exporter = export.Exporter(file_name)
    for rule_name, rule_graph in graphs.items():
        far_exporter[rule_name] = rule_graph.optimize()
    far_exporter.close()
    print(f'Created {file_name}')
def get_plurals(fst):
    """
    Maps the singular forms accepted by the given fst to their plural forms.

    Args:
        fst: Fst accepting singular forms

    Returns fst producing the corresponding plural forms
    """
    return pynini.compose(SINGULAR_TO_PLURAL, fst)
def get_singulars(fst):
    """
    Maps the plural forms accepted by the given fst to their singular forms.

    Args:
        fst: Fst accepting plural forms

    Returns fst producing the corresponding singular forms
    """
    return pynini.compose(PLURAL_TO_SINGULAR, fst)
def convert_space(fst) -> 'pynini.FstLike':
    """
    Converts space to nonbreaking space.
    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty".
    This makes the transducer significantly slower, so only use it when there could be
    spaces within quotes; otherwise leave it out.

    Args:
        fst: input fst

    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    space_to_nbsp = pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)
    rewrite_everywhere = pynini.cdrewrite(space_to_nbsp, "", "", NEMO_SIGMA)
    return fst @ rewrite_everywhere
class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        # Bug fix: previously this assigned the builtin type ``str`` instead of the
        # ``kind`` argument, so ``self.kind`` never held 'classify'/'verbalize'.
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic
        # Pre-built FAR location for this grammar; loaded lazily below if present.
        self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around to given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes class name wrap around output of given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore plain spaces that convert_space turned into non-breaking spaces.
        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,50 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_UPPER, GraphFst, insert_space
from pynini.lib import pynutil
class AbbreviationFst(GraphFst):
    """
    Finite state transducer for classifying abbreviations,
        e.g. "ABC" -> tokens { abbreviation { value: "A B C" } }

    Args:
        whitelist: whitelist FST (its input projection is excluded from this grammar)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
        super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)
        dot = pynini.accep(".")
        # A.B.C. -> A. B. C.
        graph = NEMO_UPPER + dot + pynini.closure(insert_space + NEMO_UPPER + dot, 1)
        # A.B.C. -> A.B.C.
        graph |= NEMO_UPPER + dot + pynini.closure(NEMO_UPPER + dot, 1)
        # ABC -> A B C
        graph |= NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)
        # exclude words that are included in the whitelist
        graph = pynini.compose(
            pynini.difference(pynini.project(graph, "input"), pynini.project(whitelist.graph, "input")), graph
        )
        # serialize as an attribute: value: "A B C"
        graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
        graph = self.add_tokens(graph)
        self.fst = graph.optimize()

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
NEMO_NOT_QUOTE,
NEMO_SIGMA,
GraphFst,
insert_space,
)
from nemo_text_processing.text_normalization.en.taggers.date import get_four_digit_year_graph
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.examples import plurals
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals, e.g.
        -23 -> cardinal { negative: "true" integer: "twenty three" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: when True, skips the extra "and"/"hundred"-dropping variants added by
            add_optional_and (used for language-model rescoring)
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        self.lm = lm
        self.deterministic = deterministic
        # TODO replace to have "oh" as a default for "0"
        # Precompiled digit-string -> number-name transducer.
        graph = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()
        # 2-3 digit strings, or a single non-zero digit, read as a number name.
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        ) @ graph
        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
        # digit -> spoken name, one digit at a time, e.g. "12" -> "one two"
        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)
        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross("0", "oh")
            self.single_digits_graph = single_digits_graph_zero + pynini.closure(
                insert_space + single_digits_graph_zero
            )
            self.single_digits_graph |= single_digits_graph_oh + pynini.closure(insert_space + single_digits_graph_oh)
            # digit-by-digit reading of comma-grouped numbers, e.g. "1,234"
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + insert_space, 1, 3
            ) + pynini.closure(
                pynutil.delete(",")
                + single_digits_graph
                + insert_space
                + single_digits_graph
                + insert_space
                + single_digits_graph,
                1,
            )
        optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
        # Accept optional thousands separators: "1,234,567" or "1234567".
        graph = (
            pynini.closure(NEMO_DIGIT, 1, 3)
            + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3))
        ) @ graph
        self.graph = graph
        self.graph_with_and = self.add_optional_and(graph)
        if deterministic:
            # Five or more digits are read digit by digit (preferred over the cardinal reading).
            long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
            final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
            )
            final_graph |= cardinal_with_leading_zeros
        else:
            leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros + pynutil.insert(" ") + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
            )
            # add small weight to non-default graphs to make sure the deterministic option is listed first
            final_graph = (
                self.graph_with_and
                | pynutil.add_weight(self.single_digits_graph, 0.0001)
                | get_four_digit_year_graph()  # allows e.g. 4567 to be pronounced as forty five sixty seven
                | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
                | cardinal_with_leading_zeros
            )
        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

    def add_optional_and(self, graph):
        """
        Augments ``graph`` with variants inserting "and" after "hundred"/"thousand"
        (e.g. "one hundred twenty" -> "one hundred and twenty") and with the
        "hundred"-dropped reading of three-digit numbers (e.g. 123 -> "one twenty three").
        Returns ``graph`` unchanged when ``self.lm`` is True.
        """
        graph_with_and = graph
        if not self.lm:
            # slight penalty so the plain reading stays the first option
            graph_with_and = pynutil.add_weight(graph, 0.00001)
            not_quote = pynini.closure(NEMO_NOT_QUOTE)
            # only insert "and" after "hundred" when no higher unit follows
            no_thousand_million = pynini.difference(
                not_quote, not_quote + pynini.union("thousand", "million") + not_quote
            ).optimize()
            integer = (
                not_quote + pynutil.add_weight(pynini.cross("hundred ", "hundred and ") + no_thousand_million, -0.0001)
            ).optimize()
            no_hundred = pynini.difference(NEMO_SIGMA, not_quote + pynini.accep("hundred") + not_quote).optimize()
            integer |= (
                not_quote + pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001)
            ).optimize()
            # three non-zero digits may drop the word "hundred"
            optional_hundred = pynini.compose((NEMO_DIGIT - "0") ** 3, graph).optimize()
            optional_hundred = pynini.compose(optional_hundred, NEMO_SIGMA + pynini.cross(" hundred", "") + NEMO_SIGMA)
            graph_with_and |= pynini.compose(graph, integer).optimize()
            graph_with_and |= optional_hundred
        return graph_with_and

View File

@@ -0,0 +1,370 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_LOWER,
NEMO_SIGMA,
NEMO_NOT_QUOTE,
TO_LOWER,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.examples import plurals
from pynini.lib import pynutil
# The number TSVs map spoken form -> digits; invert yields digits -> spoken form.
graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize()
graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize()
# Year era suffixes (e.g. "A.D.", "B.C."), also accepted with trailing punctuation.
year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv"))
year_suffix.extend(augment_labels_with_punct_at_end(year_suffix))
year_suffix = pynini.string_map(year_suffix).optimize()
def get_ties_graph(deterministic: bool = True):
    """
    Returns two digit transducer, e.g.
        03 -> o three
        12 -> thirteen
        20 -> twenty
    """
    two_digit = (
        graph_teen
        | ties_graph + pynutil.delete("0")
        | ties_graph + insert_space + graph_digit
    )
    # leading-zero forms: "03" -> "o three" (plus "zero three" in non-deterministic mode)
    if deterministic:
        leading_zero = pynini.cross("0", "o")
    else:
        leading_zero = pynini.cross("0", "o") | pynini.cross("0", "zero")
    two_digit |= leading_zero + insert_space + graph_digit
    return two_digit.optimize()
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
        1219 -> twelve nineteen
        3900 -> thirty nine hundred
    """
    graph_ties = get_ties_graph(deterministic)
    # decade plurals, e.g. "1970s" -> "nineteen seventies"
    graph_with_s = (
        (graph_ties + insert_space + graph_ties)
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")))
    ) + pynutil.delete("0s")
    graph_with_s |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred") + pynutil.delete("s")
    # pluralize the final word: "...ty" -> "...ties", otherwise append "s"
    graph_with_s = graph_with_s @ pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    # plain pairs of two-digit groups: 1219 -> twelve nineteen
    graph = graph_ties + insert_space + graph_ties
    graph |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred")
    # "thousand" readings, e.g. 2001 -> two thousand one, 2000s -> two thousands
    thousand_graph = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    )
    thousand_graph |= (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )
    graph |= graph_with_s
    if deterministic:
        # prefer the "thousand" reading where both readings apply
        graph = plurals._priority_union(thousand_graph, graph, NEMO_SIGMA)
    else:
        graph |= thousand_graph
    return graph.optimize()
def _get_two_digit_year_with_s_graph():
    """Handles decade forms with an optional apostrophe, e.g. '70s -> seventies."""
    optional_apostrophe = pynini.closure(pynutil.delete("'"), 0, 1)
    decade = ties_graph + pynutil.delete("0s")
    y_to_ies = pynini.cdrewrite(pynini.cross("y", "ies"), "", "[EOS]", NEMO_SIGMA)
    return (optional_apostrophe + pynini.compose(decade, y_to_ies)).optimize()
def _get_year_graph(cardinal_graph, deterministic: bool = True):
    """
    Transducer for year, only from 1000 - 2999 e.g.
        1290 -> twelve ninety
        2000 - 2009 will be verbalized as two thousand.

    Transducer for 3 digit year, e.g. 123 -> one twenty three

    Transducer for year with suffix
        123 A.D., 4200 B.C
    """
    graph = get_four_digit_year_graph(deterministic)
    # restrict to years beginning with 1 or 2; allow an optional (possibly space-separated) plural "s"
    graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph
    graph |= _get_two_digit_year_with_s_graph()
    # three-digit years read as digit + two-digit cardinal, e.g. 123 -> one twenty three
    three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
    year_with_suffix = (
        (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix
    )
    graph |= year_with_suffix
    return graph.optimize()
def _get_two_digit_year(cardinal_graph, single_digits_graph):
    """Two-digit year FST: prefer the cardinal reading, falling back to digit-by-digit."""
    return (NEMO_DIGIT ** (2)) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date, e.g.
        jan. 5, 2012 -> date { month: "january" day: "five" year: "twenty twelve" preserve_order: true }
        jan. 5 -> date { month: "january" day: "five" preserve_order: true }
        5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" preserve_order: true }
        2012-01-05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012.01.05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012/01/05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012 -> date { year: "twenty twelve" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: if True, enable the permuted field orderings used for LM-based normalization
    """
    def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
        super().__init__(name="date", kind="classify", deterministic=deterministic)
        # january
        month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
        # January, JANUARY
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )
        # jan
        month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
        # jan, Jan, JAN; optional trailing period ("jan.")
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()
        month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit
        year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)
        # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
        # year_graph |= three_digit_year
        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")
        # Ordinal day endings ("5th", "1st", ...) in either case, deleted from the day field.
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)
        # Day 1-31, optional leading "the ", optional ordinal ending.
        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )
        two_digit_year = _get_two_digit_year(
            cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
        )
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")
        # if lm:
        #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
        #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
        #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)
        # Trailing year preceded by a space or by ", " (e.g. "jan. 5, 2012").
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        graph_year |= (
            pynutil.insert(" year: \"")
            + pynini.accep(",")
            + pynini.closure(pynini.accep(" "), 0, 1)
            + year_graph
            + pynutil.insert("\"")
        )
        optional_graph_year = pynini.closure(graph_year, 0, 1)
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")
        # month-day(-year) word forms, e.g. "jan. 5, 2012", "jan-5".
        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )
        graph_mdy |= (
            month_graph
            + pynini.cross("-", " ")
            + day_graph
            + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
        )
        # Numeric MM-DD-YY(YY) with "-", "/" or "." separators; leading zero of the day dropped.
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_mdy |= (
                month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )
        # day-month(-year) word forms, e.g. "5 january 2012".
        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        # Numeric DD-MM-YY(YY): the first field must not be a valid month number,
        # otherwise it would be ambiguous with MM-DD-YY(YY) above.
        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_dmy |= (
                day_ex_month
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )
        # Numeric YY(YY)-MM-DD.
        graph_ymd = pynini.accep("")
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_ymd |= (
                (year_graph | two_digit_year)
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
        final_graph = graph_mdy | graph_dmy
        if not deterministic or lm:
            # preserve_order is optional in non-deterministic mode so both orderings survive.
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            # Month/day without a year, e.g. "01-05".
            m_sep_d = (
                month_numbers_graph
                + pynutil.delete(pynini.union("-", "/"))
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
            final_graph |= m_sep_d
        else:
            final_graph += pynutil.insert(" preserve_order: true")
        final_graph |= graph_ymd | year_graph
        if not deterministic or lm:
            # Enumerate per-(month, day) permutations of the tagged fields so the
            # verbalizer can offer alternative readings (e.g. "January fourth" vs
            # "the fourth of January").
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None
            for month in [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )
                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )
                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )
                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )
                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()
                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()
                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()
            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,129 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path
from pynini.lib import pynutil
# Deletes a single literal space (between a number and its quantity word).
delete_space = pynutil.delete(" ")
# Quantity words ("thousand", "million", ...) and their abbreviations from TSV data.
quantities = pynini.string_file(get_abs_path("data/number/thousand.tsv"))
quantities_abbr = pynini.string_file(get_abs_path("data/number/quantity_abbr.tsv"))
# Also accept case-folded single-char abbreviations via TO_LOWER->TO_UPPER mapping
# (presumably maps e.g. "k"/"K" to the same expansion — confirm TSV key casing).
quantities_abbr |= TO_UPPER @ quantities_abbr
def get_quantity(
    decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike', include_abbr: bool
) -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. 1 million -> integer_part: "one" quantity: "million"
    e.g. 1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
        include_abbr: if True, also accept abbreviated quantities (from quantity_abbr.tsv)
    """
    # "thousand"/"k"/"K" is excluded from the bare-cardinal branch — presumably
    # handled by the cardinal/decimal grammar itself; confirm against callers.
    quantity_wo_thousand = pynini.project(quantities, "input") - pynini.union("k", "K", "thousand")
    if include_abbr:
        quantity_wo_thousand |= pynini.project(quantities_abbr, "input") - pynini.union("k", "K", "thousand")
    # Cardinal branch: "1 million" -> integer_part: "one" quantity: "million".
    res = (
        pynutil.insert("integer_part: \"")
        + cardinal_up_to_hundred
        + pynutil.insert("\"")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynutil.insert(" quantity: \"")
        + (quantity_wo_thousand @ (quantities | quantities_abbr))
        + pynutil.insert("\"")
    )
    if include_abbr:
        quantity = quantities | quantities_abbr
    else:
        quantity = quantities
    # Decimal branch: "1.5 million" -> ... fractional_part: "five" quantity: "million".
    res |= (
        decimal
        + pynini.closure(pynutil.delete(" "), 0, 1)
        # Fix: leading space so the quantity field is separated from the preceding
        # fractional_part field (matches the cardinal branch above); without it the
        # output would be `fractional_part: "five"quantity: "million"`.
        + pynutil.insert(" quantity: \"")
        + quantity
        + pynutil.insert("\"")
    )
    return res
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal, e.g.
        -12.5006 billion -> decimal { negative: "true" integer_part: "twelve" fractional_part: "five o o six" quantity: "billion" }
        1 billion -> decimal { integer_part: "one" quantity: "billion" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph_with_and
        cardinal_graph_hundred_component_at_least_one_none_zero_digit = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        # Fractional digits are read one by one ("5006" -> "five zero zero six").
        self.graph = cardinal.single_digits_graph.optimize()
        if not deterministic:
            self.graph = self.graph | cardinal_graph
        point = pynutil.delete(".")
        optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
        self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
        self.graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        # Integer part is optional, so bare ".5" is accepted.
        final_graph_wo_sign = (
            pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
            + point
            + pynutil.insert(" ")
            + self.graph_fractional
        )
        quantity_w_abbr = get_quantity(
            final_graph_wo_sign, cardinal_graph_hundred_component_at_least_one_none_zero_digit, include_abbr=True
        )
        quantity_wo_abbr = get_quantity(
            final_graph_wo_sign, cardinal_graph_hundred_component_at_least_one_none_zero_digit, include_abbr=False
        )
        self.final_graph_wo_negative_w_abbr = final_graph_wo_sign | quantity_w_abbr
        self.final_graph_wo_negative = final_graph_wo_sign | quantity_wo_abbr
        # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
        if not deterministic:
            # Reject any output mixing "oh" and "zero" in either order.
            no_oh_zero = pynini.difference(
                NEMO_SIGMA,
                (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
                | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA),
            ).optimize()
            no_zero_oh = pynini.difference(
                NEMO_SIGMA, NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA
            ).optimize()
            # Additionally allow integer part "zero" to be read as "oh".
            self.final_graph_wo_negative |= pynini.compose(
                self.final_graph_wo_negative,
                pynini.cdrewrite(
                    pynini.cross("integer_part: \"zero\"", "integer_part: \"oh\""), NEMO_SIGMA, NEMO_SIGMA, NEMO_SIGMA
                ),
            )
            self.final_graph_wo_negative = pynini.compose(self.final_graph_wo_negative, no_oh_zero).optimize()
            self.final_graph_wo_negative = pynini.compose(self.final_graph_wo_negative, no_zero_oh).optimize()
        final_graph = optional_graph_negative + self.final_graph_wo_negative
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
GraphFst,
get_abs_path,
insert_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
        e.g. cdf1@abc.edu -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)
        accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
        accepted_common_domains = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
        )
        # A token: a letter followed by any mix of letters/digits/accepted symbols.
        all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
        username = (
            pynutil.insert("username: \"") + all_accepted_symbols + pynutil.insert("\"") + pynini.cross('@', ' ')
        )
        # Domain must contain a dot and end with a letter.
        domain_graph = all_accepted_symbols + pynini.accep('.') + all_accepted_symbols + NEMO_ALPHA
        # NOTE(review): ':' is verbalized as "semicolon" here — looks like it should be "colon"; confirm.
        protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "semicolon")) + pynutil.insert(" "))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
            pynini.accep("://") @ protocol_symbols
        )
        protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)
        protocol_end = pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols
        protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)
        # Exclude strings starting with a protocol prefix so they are not double-matched.
        domain_graph = (
            pynutil.insert("domain: \"")
            + pynini.difference(domain_graph, pynini.project(protocol, "input") + NEMO_SIGMA)
            + pynutil.insert("\"")
        )
        # Domains ending in a known common TLD, optionally followed by a path segment.
        domain_common_graph = (
            pynutil.insert("domain: \"")
            + pynini.difference(
                all_accepted_symbols
                + accepted_common_domains
                + pynini.closure(accepted_symbols + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols), 0, 1),
                pynini.project(protocol, "input") + NEMO_SIGMA,
            )
            + pynutil.insert("\"")
        )
        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
        # email
        graph = username + domain_graph
        # abc.com, abc.com/123-sm
        graph |= domain_common_graph
        # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
        graph |= protocol + pynutil.insert(" ") + domain_graph
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,55 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, get_abs_path
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction
    "23 4/5" ->
    tokens { fraction { integer_part: "twenty three" numerator: "four" denominator: "five" } }
    "23 4/5th" ->
    tokens { fraction { integer_part: "twenty three" numerator: "four" denominator: "five" } }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        # Numerator is terminated by "/" (optionally space-padded), which also closes the field.
        numerator = (
            pynutil.insert("numerator: \"") + cardinal_graph + (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" "))
        )
        # Optional ordinal suffix on the denominator ("4/5th" -> denominator "five").
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        optional_end = pynini.closure(pynini.cross(pynini.union(*endings), ""), 0, 1)
        denominator = pynutil.insert("denominator: \"") + cardinal_graph + optional_end + pynutil.insert("\"")
        graph = pynini.closure(integer + pynini.accep(" "), 0, 1) + (numerator + denominator)
        # Fraction forms mapped via data/number/fraction.tsv — presumably special
        # glyphs such as "½" rewritten to "n/m" before tagging; confirm against the TSV.
        graph |= pynini.closure(integer + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1) + pynini.compose(
            pynini.string_file(get_abs_path("data/number/fraction.tsv")), (numerator + denominator)
        )
        self.graph = graph
        final_graph = self.add_tokens(self.graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,304 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NON_BREAKING_SPACE,
NEMO_SIGMA,
NEMO_SPACE,
NEMO_UPPER,
SINGULAR_TO_PLURAL,
TO_LOWER,
GraphFst,
convert_space,
delete_space,
delete_zero_or_one_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst as OrdinalTagger
from nemo_text_processing.text_normalization.en.taggers.whitelist import get_formats
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as OrdinalVerbalizer
from pynini.examples import plurals
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure, suppletive aware, e.g.
        -12kg -> measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" }
        1kg -> measure { cardinal { integer: "one" } units: "kilogram" }
        .5kg -> measure { decimal { fractional_part: "five" } units: "kilograms" }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
        fraction: FractionFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        super().__init__(name="measure", kind="classify", deterministic=deterministic)
        # Cardinals plus range forms such as "2-3", "2x3", "2*2" (see get_range).
        cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and)
        graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
        if not deterministic:
            graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv"))
        # Also accept capitalized unit spellings by lower-casing before the TSV lookup.
        graph_unit |= pynini.compose(
            pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit
        ).optimize()
        graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL)
        graph_unit = convert_space(graph_unit)
        optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
        # "/unit" -> "per unit" (e.g. "km/h" -> "... per hour").
        graph_unit2 = (
            pynini.cross("/", "per") + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit
        )
        optional_graph_unit2 = pynini.closure(
            delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1,
        )
        unit_plural = (
            pynutil.insert("units: \"")
            + (graph_unit_plural + optional_graph_unit2 | graph_unit2)
            + pynutil.insert("\"")
        )
        unit_singular = (
            pynutil.insert("units: \"") + (graph_unit + optional_graph_unit2 | graph_unit2) + pynutil.insert("\"")
        )
        # Decimals always take the plural unit.
        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + unit_plural
        )
        # support radio FM/AM
        subgraph_decimal |= (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + pynutil.insert("units: \"")
            + pynini.union("AM", "FM")
            + pynutil.insert("\"")
        )
        # Cardinal != 1 takes the plural unit; exactly "1" takes the singular unit.
        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert("integer: \"")
            + ((NEMO_SIGMA - "1") @ cardinal_graph)
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + unit_plural
        )
        subgraph_cardinal |= (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert("integer: \"")
            + pynini.cross("1", "one")
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + unit_singular
        )
        # Bare "per unit" with no number, e.g. "/kg" -> "per kilogram".
        unit_graph = (
            pynutil.insert("cardinal { integer: \"-\" } units: \"")
            + pynini.cross(pynini.union("/", "per"), "per")
            + delete_zero_or_one_space
            + pynutil.insert(NEMO_NON_BREAKING_SPACE)
            + graph_unit
            + pynutil.insert("\" preserve_order: true")
        )
        # Decimal joined to an alphabetic unit with a dash, e.g. "1.2-ghz".
        decimal_dash_alpha = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynini.cross('-', '')
            + pynutil.insert(" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )
        # Magnification forms, e.g. "1.5x"/"1.5X" -> "... x".
        decimal_times = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } units: \"")
            + pynini.cross(pynini.union('x', "X"), 'x')
            + pynutil.insert("\"")
        )
        # Alphabetic prefix joined to a decimal with a dash, e.g. "v-1.5".
        alpha_dash_decimal = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynini.accep('-')
            + pynutil.insert("\"")
            + pynutil.insert(" decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } preserve_order: true")
        )
        subgraph_fraction = (
            pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit_plural
        )
        # Street addresses are tagged as a pseudo-measure with units "address".
        address = self.get_address_graph(cardinal)
        address = (
            pynutil.insert("units: \"address\" cardinal { integer: \"")
            + address
            + pynutil.insert("\" } preserve_order: true")
        )
        # Simple equations, e.g. "2 + 2 = 4" -> "two plus two equals four".
        math_operations = pynini.string_file(get_abs_path("data/measure/math_operation.tsv"))
        delimiter = pynini.accep(" ") | pynutil.insert(" ")
        math = (
            (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + math_operations
            + (delimiter | NEMO_ALPHA)
            + cardinal_graph
            + delimiter
            + pynini.cross("=", "equals")
            + delimiter
            + (cardinal_graph | NEMO_ALPHA)
        )
        math |= (
            (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + pynini.cross("=", "equals")
            + delimiter
            + (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + math_operations
            + delimiter
            + cardinal_graph
        )
        math = (
            pynutil.insert("units: \"math\" cardinal { integer: \"")
            + math
            + pynutil.insert("\" } preserve_order: true")
        )
        final_graph = (
            subgraph_decimal
            | subgraph_cardinal
            | unit_graph
            | decimal_dash_alpha
            | decimal_times
            | alpha_dash_decimal
            | subgraph_fraction
            | address
            | math
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
    def get_range(self, cardinal: GraphFst):
        """
        Returns range forms for measure tagger, e.g. 2-3, 2x3, 2*2

        Args:
            cardinal: cardinal GraphFst
        """
        range_graph = cardinal + pynini.cross(pynini.union("-", " - "), " to ") + cardinal
        for x in [" x ", "x"]:
            range_graph |= cardinal + pynini.cross(x, " by ") + cardinal
            if not self.deterministic:
                range_graph |= cardinal + pynini.cross(x, " times ") + cardinal
        for x in ["*", " * "]:
            range_graph |= cardinal + pynini.cross(x, " times ") + cardinal
        return range_graph.optimize()
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                preserve_order: true
        """
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        ordinal_num = pynini.compose(
            pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\""), ordinal_verbalizer
        )
        # House numbers are read in pairs, e.g. 2788 -> "twenty seven eighty eight".
        address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
        address_num += insert_space + NEMO_DIGIT ** 2 @ (
            pynini.closure(pynini.cross("0", "zero "), 0, 1)
            + cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        # to handle the rest of the numbers
        address_num = pynini.compose(NEMO_DIGIT ** (3, 4), address_num)
        address_num = plurals._priority_union(address_num, cardinal.graph, NEMO_SIGMA)
        # Compass abbreviations expanded, optional trailing period ("N." -> "North").
        direction = (
            pynini.cross("E", "East")
            | pynini.cross("S", "South")
            | pynini.cross("W", "West")
            | pynini.cross("N", "North")
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        direction = pynini.closure(pynini.accep(NEMO_SPACE) + direction, 0, 1)
        address_words = get_formats(get_abs_path("data/address/address_word.tsv"))
        # Street name: optional ordinal or capitalized word(s), ending in a street-type word.
        address_words = (
            pynini.accep(NEMO_SPACE)
            + (pynini.closure(ordinal_num, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
            + NEMO_SPACE
            + pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
            + address_words
        )
        city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1)
        states = load_labels(get_abs_path("data/address/state.tsv"))
        # Also accept dotted state abbreviations, e.g. "CA" and "C.A" forms.
        additional_options = []
        for x, y in states:
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        state = pynini.invert(state_graph)
        state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)
        # ZIP codes are read digit by digit.
        zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
        zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, 0, 1,)
        address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1)
        address |= address_num + direction + address_words + pynini.closure(pynini.cross(".", ""), 0, 1)
        return address

View File

@@ -0,0 +1,192 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
SINGULAR_TO_PLURAL,
GraphFst,
convert_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.lib import pynutil
# Currency symbol -> minor-unit name, singular and plural (e.g. "$" -> "cent"/"cents").
min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv"))
min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv"))
# Currency symbol -> major-unit name (e.g. "$" -> "dollar").
maj_singular = pynini.string_file((get_abs_path("data/money/currency_major.tsv")))
class MoneyFst(GraphFst):
"""
Finite state transducer for classifying money, suppletive aware, e.g.
$12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
$12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
$1 -> money { currency_maj: "dollar" integer_part: "one" }
$1.00 -> money { currency_maj: "dollar" integer_part: "one" }
$0.05 -> money { fractional_part: "five" currency_min: "cents" preserve_order: true }
$1 million -> money { currency_maj: "dollars" integer_part: "one" quantity: "million" }
$1.2 million -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two" quantity: "million" }
$1.2320 -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two three two" }
Args:
cardinal: CardinalFst
decimal: DecimalFst
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
super().__init__(name="money", kind="classify", deterministic=deterministic)
cardinal_graph = cardinal.graph_with_and
graph_decimal_final = decimal.final_graph_wo_negative_w_abbr
maj_singular_labels = load_labels(get_abs_path("data/money/currency_major.tsv"))
maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
maj_unit_singular = convert_space(maj_singular)
graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")
optional_delete_fractional_zeros = pynini.closure(
pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
)
graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")
# only for decimals where third decimal after comma is non-zero or with quantity
decimal_delete_last_zeros = (
pynini.closure(NEMO_DIGIT | pynutil.delete(","))
+ pynini.accep(".")
+ pynini.closure(NEMO_DIGIT, 2)
+ (NEMO_DIGIT - "0")
+ pynini.closure(pynutil.delete("0"))
)
decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA
graph_decimal = (
graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
)
graph_integer = (
pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
)
graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
graph_integer_only |= graph_maj_plural + insert_space + graph_integer
final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal
# remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
# e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
# not accepted: 002, 00, 0,
two_digits_fractional_part = (
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
) @ (
(pynutil.delete("0") + (NEMO_DIGIT - "0"))
| ((NEMO_DIGIT - "0") + pynutil.insert("0"))
| ((NEMO_DIGIT - "0") + NEMO_DIGIT)
)
graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")
# format ** dollars ** cent
decimal_graph_with_minor = None
integer_graph_reordered = None
decimal_default_reordered = None
for curr_symbol, _ in maj_singular_labels:
preserve_order = pynutil.insert(" preserve_order: true")
integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular
integer_plus_maj_with_comma = pynini.compose(
NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
)
integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
integer_plus_maj |= integer_plus_maj_with_comma
graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
graph_fractional = (
two_digits_fractional_part
@ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
@ cardinal.graph_hundred_component_at_least_one_none_zero_digit
)
graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")
fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
fractional_plus_min |= (
graph_fractional_one + insert_space + pynutil.insert(curr_symbol) @ graph_min_singular
)
decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min
if not deterministic:
decimal_graph_with_minor_curr |= pynutil.add_weight(
integer_plus_maj
+ pynini.cross(".", " ")
+ pynutil.insert("fractional_part: \"")
+ two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
+ pynutil.insert("\""),
weight=0.0001,
)
default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
decimal_graph_with_minor_curr |= (
pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") + fractional_plus_min
)
decimal_graph_with_minor_curr = (
pynutil.delete(curr_symbol) + decimal_graph_with_minor_curr + preserve_order
)
decimal_graph_with_minor = (
decimal_graph_with_minor_curr
if decimal_graph_with_minor is None
else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr).optimize()
)
if not deterministic:
integer_graph_reordered_curr = (
pynutil.delete(curr_symbol) + integer_plus_maj + preserve_order
).optimize()
integer_graph_reordered = (
integer_graph_reordered_curr
if integer_graph_reordered is None
else pynini.union(integer_graph_reordered, integer_graph_reordered_curr).optimize()
)
decimal_default_reordered_curr = (
pynutil.delete(curr_symbol)
+ default_fraction_graph
+ insert_space
+ pynutil.insert(curr_symbol) @ graph_maj_plural
)
decimal_default_reordered = (
decimal_default_reordered_curr
if decimal_default_reordered is None
else pynini.union(decimal_default_reordered, decimal_default_reordered_curr)
).optimize()
# weight for SH
final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)
if not deterministic:
final_graph |= integer_graph_reordered | decimal_default_reordered
# to handle "$2.00" cases
final_graph |= pynini.compose(
NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
)
final_graph = self.add_tokens(final_graph.optimize())
self.fst = final_graph.optimize()

View File

@@ -0,0 +1,61 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal, e.g.
        13th -> ordinal { integer: "thirteen" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        # Any run of digits and thousands separators, e.g. "1,234".
        digits_with_commas = pynini.closure(NEMO_DIGIT | pynini.accep(","))

        def _suffix_format(last_digit, suffix_lower, suffix_upper):
            # Accept numbers ending in `last_digit` (not preceded by a "1", which
            # would make it a "teen") and strip the written ordinal suffix.
            return (
                pynini.closure(digits_with_commas + (NEMO_DIGIT - "1"), 0, 1)
                + pynini.accep(last_digit)
                + pynutil.delete(pynini.union(suffix_lower, suffix_upper))
            )

        first = _suffix_format("1", "st", "ST")
        second = _suffix_format("2", "nd", "ND")
        third = _suffix_format("3", "rd", "RD")
        # Everything else takes "th": a final digit other than 1/2/3, any "teen"
        # ending (x1y), or a final pair whose last digit is not 1/2/3.
        other = pynini.closure(
            (NEMO_DIGIT - "1" - "2" - "3")
            | (digits_with_commas + "1" + NEMO_DIGIT)
            | (digits_with_commas + (NEMO_DIGIT - "1") + (NEMO_DIGIT - "1" - "2" - "3")),
            1,
        ) + pynutil.delete(pynini.union("th", "TH"))

        self.graph = (first | second | third | other) @ cardinal_graph
        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()

View File

@@ -0,0 +1,65 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from unicodedata import category
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.examples import plurals
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
        ascii_punct = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
        excluded = ["[", "]"]
        # Every Unicode codepoint in a punctuation ("P*") category, minus the
        # square brackets excluded above.
        unicode_punct = [
            chr(cp)
            for cp in range(sys.maxunicode)
            if category(chr(cp)).startswith("P") and chr(cp) not in excluded
        ]
        whitelist_symbols = [row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
        # Whitelisted symbols are handled by other grammars, so drop them here.
        self.punct_marks = [mark for mark in unicode_punct + list(ascii_punct) if mark not in whitelist_symbols]
        punct = pynini.closure(pynini.union(*self.punct_marks), 1)
        # Keep markup-like tags such as "<b>" or "</i>" intact instead of
        # splitting them apart on "<" / ">".
        inner = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        emphasis = (
            pynini.accep("<")
            + ((inner + pynini.closure(pynini.accep("/"), 0, 1)) | (pynini.accep("/") + inner))
            + pynini.accep(">")
        )
        punct = plurals._priority_union(emphasis, punct, NEMO_SIGMA)
        self.graph = punct
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()

View File

@@ -0,0 +1,102 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space
from pynini.lib import pynutil
class RangeFst(GraphFst):
    """
    Composite tagger for ranges and simple arithmetic expressions, built on top
    of the time, date and cardinal taggers, e.g. "2-3" -> "two to three".

    Args:
        time: composed tagger and verbalizer
        date: composed tagger and verbalizer
        cardinal: tagger
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(
        self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False,
    ):
        super().__init__(name="range", kind="classify", deterministic=deterministic)
        # Optionally swallow a single space around the range separator.
        delete_space = pynini.closure(pynutil.delete(" "), 0, 1)
        approx = pynini.cross("~", "approximately")
        # TIME: "<time>-<time>" -> "<time> to <time>"
        time_graph = time + delete_space + pynini.cross("-", " to ") + delete_space + time
        self.graph = time_graph | (approx + time)
        cardinal = cardinal.graph_with_and
        # YEAR: 4- or 2-digit years, optionally with a decade "s" suffix
        date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
        date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
        year_to_year_graph = (
            date_year_four_digit
            + delete_space
            + pynini.cross("-", " to ")
            + delete_space
            + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal))
        )
        # "mid-90s" -> "mid 90s"
        mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit)
        self.graph |= year_to_year_graph
        self.graph |= mid_year_graph
        # ADDITION: "2+3" / "2 + 3" -> "two plus three"
        range_graph = cardinal + pynini.closure(pynini.cross("+", " plus ") + cardinal, 1)
        range_graph |= cardinal + pynini.closure(pynini.cross(" + ", " plus ") + cardinal, 1)
        range_graph |= approx + cardinal
        range_graph |= cardinal + (pynini.cross("...", " ... ") | pynini.accep(" ... ")) + cardinal
        if not deterministic or lm:
            # cardinal ranges: "2-3" -> "two to three" / "two minus three"
            cardinal_to_cardinal_graph = (
                cardinal + delete_space + pynini.cross("-", pynini.union(" to ", " minus ")) + delete_space + cardinal
            )
            range_graph |= cardinal_to_cardinal_graph | (
                cardinal + delete_space + pynini.cross(":", " to ") + delete_space + cardinal
            )
            # MULTIPLY: "2x3" -> "two by three" / "two times three"
            for x in [" x ", "x"]:
                range_graph |= cardinal + pynini.closure(
                    pynini.cross(x, pynini.union(" by ", " times ")) + cardinal, 1
                )
            for x in ["*", " * "]:
                range_graph |= cardinal + pynini.closure(pynini.cross(x, " times ") + cardinal, 1)
            # supports "No. 12" -> "Number 12"
            range_graph |= (
                (pynini.cross(pynini.union("NO", "No"), "Number") | pynini.cross("no", "number"))
                + pynini.closure(pynini.union(". ", " "), 0, 1)
                + cardinal
            )
            # DIVISION: "2/3" -> "two divided by three"
            for x in ["/", " / "]:
                range_graph |= cardinal + pynini.closure(pynini.cross(x, " divided by ") + cardinal, 1)
        self.graph |= range_graph
        self.graph = self.graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()

View File

@@ -0,0 +1,114 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.lib import pynutil
class RomanFst(GraphFst):
    """
    Finite state transducer for classifying roman numbers:
        e.g. "IV" -> tokens { roman { integer: "four" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="roman", kind="classify", deterministic=deterministic)
        # roman_to_spoken.tsv maps written roman numerals to their spoken form;
        # index 0 is "I" (see the exclusion below).
        roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")
        ordinal_limit = 19
        if deterministic:
            # exclude "I" (would clash with the English pronoun)
            start_idx = 1
        else:
            start_idx = 0
        graph_teens = pynini.string_map([x[0] for x in roman_dict[start_idx:ordinal_limit]]).optimize()
        # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
        names = get_names()
        graph = (
            pynutil.insert("key_the_ordinal: \"")
            + names
            + pynutil.insert("\"")
            + pynini.accep(" ")
            + graph_teens @ default_graph
        ).optimize()
        # single symbol roman numerals with preceding key words (multiple formats) are converted to cardinal form
        key_words = []
        for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
            key_words.append(k_word)
            # capitalized variant, e.g. "chapter" -> "Chapter"
            key_words.append([k_word[0][0].upper() + k_word[0][1:]])
            # fully upper-cased variant, e.g. "chapter" -> "CHAPTER"
            key_words.append([k_word[0].upper()])
        key_words = pynini.string_map(key_words).optimize()
        graph |= (
            pynutil.insert("key_cardinal: \"") + key_words + pynutil.insert("\"") + pynini.accep(" ") + default_graph
        ).optimize()
        if deterministic or lm:
            # two digit roman numerals up to 49
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (
                    pynutil.insert("default_cardinal: \"default\" ")
                    + (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize()) @ default_graph
                ),
            )
            graph |= roman_to_cardinal
        elif not lm:
            # two or more digit roman numerals (anything except bare "I")
            roman_to_cardinal = pynini.compose(
                pynini.difference(NEMO_SIGMA, "I"),
                (
                    pynutil.insert("default_cardinal: \"default\" integer: \"")
                    + pynini.string_map(roman_dict).optimize()
                    + pynutil.insert("\"")
                ),
            ).optimize()
            graph |= roman_to_cardinal
            # convert three digit roman or up with suffix to ordinal, e.g. "XIIIth"
            roman_to_ordinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 3),
                (pynutil.insert("default_ordinal: \"default\" ") + graph_teens @ default_graph + pynutil.delete("th")),
            )
            graph |= roman_to_ordinal
        graph = self.add_tokens(graph.optimize())
        self.fst = graph.optimize()
def get_names():
    """
    Returns the graph that matched common male and female names.
    """
    names = None
    for tsv in ("data/roman/male.tsv", "data/roman/female.tsv"):
        rows = load_labels(get_abs_path(tsv))
        # accept both the original spelling and an all-caps variant
        rows.extend([[row[0].upper()] for row in rows])
        graph = pynini.string_map(rows).optimize()
        names = graph if names is None else pynini.union(names, graph)
    return names

View File

@@ -0,0 +1,136 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
GraphFst,
convert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.examples import plurals
from pynini.lib import pynutil
class SerialFst(GraphFst):
    """
    Finite state transducer for classifying serial numbers: mixed runs of
    digits, letters, symbols and delimiters, e.g.
        c325b -> tokens { cardinal { integer: "c three two five b" } }

    Args:
        cardinal: CardinalFst tagger
        ordinal: OrdinalFst tagger (used to exclude ordinal readings from serials)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
        super().__init__(name="integer", kind="classify", deterministic=deterministic)
        """
        Finite state transducer for classifying serial (handles only cases without delimiters,
        values with delimiters are handled by default).
        The serial is a combination of digits, letters and dashes, e.g.:
        c325b -> tokens { cardinal { integer: "c three two five b" } }
        """
        # Long digit runs (6+) are read digit by digit; shorter runs (1-5) as a cardinal.
        num_graph = pynini.compose(NEMO_DIGIT ** (6, ...), cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT ** (1, 5), cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph
        ).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross(
            "#", "hash"
        )
        num_graph |= symbols_graph
        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
            )
        # add space between letter and digit/symbol
        symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
        )
        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        if not deterministic:
            delimiter |= pynini.cross("-", " dash ") | pynini.cross("/", " slash ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(
            delimiter
            + num_graph
            + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
            + alphas
        )
        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (
            num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)
        )
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)
        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph
        ).optimize()
        # slight penalty so plain cardinal/word readings win when both apply
        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # "x^2" / "x^3" -> "x squared" / "x cubed"
        serial_graph |= (
            pynini.closure(NEMO_NOT_SPACE, 1)
            + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
        )
        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter)
            + serial_graph
            + pynini.closure(delimiter + (serial_graph | num_graph | alphas))
        )
        serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
        # require at least two non-space characters overall
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()
        # this is not to verbalize "/" as "slash" in cases like "import/export"
        serial_graph = pynini.compose(
            pynini.difference(
                NEMO_SIGMA, pynini.closure(NEMO_ALPHA, 1) + pynini.accep("/") + pynini.closure(NEMO_ALPHA, 1)
            ),
            serial_graph,
        )
        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()

View File

@@ -0,0 +1,133 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
plurals,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension
    country code optional: +***
    number part: ***-***-****, or (***) ***-****
    extension optional: 1-9999
    E.g
    +1 123-123-5678-1 -> telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" }
    1-800-GO-U-HAUL -> telephone { country_code: "one" number_part: "one, eight hundred GO U HAUL" }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)
        add_separator = pynutil.insert(", ")  # between components
        zero = pynini.cross("0", "zero")
        if not deterministic:
            # allow "0" to also be read as "o"/"oh" in non-deterministic mode
            zero |= pynini.cross("0", pynini.union("o", "oh"))
        digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() | zero
        telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
        # optional prompt + optional "+" + one to three digits
        country_code = (
            pynini.closure(telephone_prompts + delete_extra_space, 0, 1)
            + pynini.closure(pynini.cross("+", "plus "), 0, 1)
            + pynini.closure(digit + insert_space, 0, 2)
            + digit
            + pynutil.insert(",")
        )
        country_code |= telephone_prompts
        country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
        country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        # "800" is verbalized as "eight hundred"; all other area codes digit by digit
        area_part = pynini.cross("800", "eight hundred") | pynini.compose(
            pynini.difference(NEMO_SIGMA, "800"), area_part_default
        )
        # area code either followed by "-"/"." or wrapped in parentheses
        area_part = (
            (area_part + (pynutil.delete("-") | pynutil.delete(".")))
            | (
                pynutil.delete("(")
                + area_part
                + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
            )
        ) + add_separator
        del_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1)
        # remaining number part: exactly 7 digits or letters, each with an optional separator
        number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator)) ** 7
        number_words = pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross("-", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross("-", ' '))
        )
        # same pattern with "." as the separator
        number_words |= pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross(".", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross(".", ' '))
        )
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")
        # optional one- to four-digit extension
        extension = (
            pynutil.insert("extension: \"") + pynini.closure(digit + insert_space, 0, 3) + digit + pynutil.insert("\"")
        )
        extension = pynini.closure(insert_space + extension, 0, 1)
        graph = plurals._priority_union(country_code + number_part, number_part, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(country_code + number_part + extension, graph, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(number_part + extension, graph, NEMO_SIGMA).optimize()
        # ip: four 1-3 digit groups joined by "." -> "dot"
        ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
        digit_to_str_graph = digit + pynini.closure(pynutil.insert(" ") + digit, 0, 2)
        ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph) ** 3
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ip_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ip_graph.optimize()
            + pynutil.insert("\"")
        )
        # ssn: 3-2-4 digit groups separated by "-"
        ssn_prompts = pynini.string_file(get_abs_path("data/telephone/ssn_prompt.tsv"))
        three_digit_part = digit + (pynutil.insert(" ") + digit) ** 2
        two_digit_part = digit + pynutil.insert(" ") + digit
        four_digit_part = digit + (pynutil.insert(" ") + digit) ** 3
        ssn_separator = pynini.cross("-", ", ")
        ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ssn_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ssn_graph.optimize()
            + pynutil.insert("\"")
        )
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,132 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
GraphFst,
convert_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time, e.g.
        12:30 a.m. est -> time { hours: "twelve" minutes: "thirty" suffix: "a m" zone: "e s t" }
        2.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "a m" }
        02.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "a m" }
        2.00 a.m. -> time { hours: "two" suffix: "a m" }
        2 a.m. -> time { hours: "two" suffix: "a m" }
        02:00 -> time { hours: "two" }
        2:00 -> time { hours: "two" }
        10:00:05 a.m. -> time { hours: "ten" minutes: "zero" seconds: "five" suffix: "a m" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)
        suffix_labels = load_labels(get_abs_path("data/time/suffix.tsv"))
        # also cover suffixes with trailing punctuation (see augment_labels_with_punct_at_end)
        suffix_labels.extend(augment_labels_with_punct_at_end(suffix_labels))
        suffix_graph = pynini.string_map(suffix_labels)
        time_zone_graph = pynini.string_file(get_abs_path("data/time/zone.tsv"))
        # only used for < 1000 thousand -> 0 weight
        cardinal = cardinal.graph
        labels_hour = [str(x) for x in range(0, 24)]
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]
        # "02" -> "2"; two-digit values pass through unchanged
        delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        )
        graph_hour = delete_leading_zero_to_double_digit @ pynini.union(*labels_hour) @ cardinal
        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
        final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
        # "05" -> "o five"
        final_graph_minute = (
            pynutil.insert("minutes: \"")
            + (pynini.cross("0", "o") + insert_space + graph_minute_single | graph_minute_double)
            + pynutil.insert("\"")
        )
        final_graph_second = (
            pynutil.insert("seconds: \"")
            + (pynini.cross("0", "o") + insert_space + graph_minute_single | graph_minute_double)
            + pynutil.insert("\"")
        )
        final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert("zone: \"")
            + convert_space(time_zone_graph)
            + pynutil.insert("\""),
            0,
            1,
        )
        # 2:30 pm, 02:30, 2:00
        graph_hm = (
            final_graph_hour
            + pynutil.delete(":")
            + (pynutil.delete("00") | insert_space + final_graph_minute)
            + final_suffix_optional
            + final_time_zone_optional
        )
        # 10:30:05 pm,
        graph_hms = (
            final_graph_hour
            + pynutil.delete(":")
            + (pynini.cross("00", " minutes: \"zero\"") | insert_space + final_graph_minute)
            + pynutil.delete(":")
            + (pynini.cross("00", " seconds: \"zero\"") | insert_space + final_graph_second)
            + final_suffix_optional
            + final_time_zone_optional
        )
        # 2.xx pm/am — note the suffix is mandatory here, presumably to avoid
        # clashing with decimal numbers; verify against DecimalFst
        graph_hm2 = (
            final_graph_hour
            + pynutil.delete(".")
            + (pynutil.delete("00") | insert_space + final_graph_minute)
            + delete_space
            + insert_space
            + final_suffix
            + final_time_zone_optional
        )
        # 2 pm est
        graph_h = final_graph_hour + delete_space + insert_space + final_suffix + final_time_zone_optional
        final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,201 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.abbreviation import AbbreviationFst
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDateFst
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinalFst
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTimeFst
from pynini.lib import pynutil
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            # Cache name encodes every option that changes the compiled grammar.
            far_file = os.path.join(
                cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Restore a previously compiled grammar instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        else:
            # TAGGERS. `cardinal` is built first because several other grammars
            # (ordinal, decimal, fraction, measure, date, time, money, serial)
            # are composed on top of it.
            # NOTE(review): the original code assigned `start_time = time.time()`
            # before each grammar but never read the value; those dead
            # assignments were removed.
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(
                input_case=input_case, deterministic=deterministic, input_file=whitelist
            ).fst
            punctuation = PunctuationFst(deterministic=deterministic)
            punct_graph = punctuation.fst
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst
            serial_graph = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic).fst

            # VERBALIZERS needed to pre-verbalize time/date before they feed the
            # range grammar (ranges operate on already-verbalized endpoints).
            v_time_graph = vTimeFst(deterministic=deterministic).fst
            v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
            v_date_graph = vDateFst(ordinal=v_ordinal_graph, deterministic=deterministic).fst
            time_final = pynini.compose(time_graph, v_time_graph)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(
                time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic
            ).fst

            # Union of all semiotic-class taggers; lower weight wins when
            # several classes match the same span.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(range_graph, 1.1)
                | pynutil.add_weight(serial_graph, 1.1001)  # should be higher than the rest of the classes
            )

            roman_graph = RomanFst(deterministic=deterministic).fst
            classify |= pynutil.add_weight(roman_graph, 1.1)

            if not deterministic:
                abbreviation_graph = AbbreviationFst(deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            # Plain words are the fallback class, hence the very high weight.
            classify |= pynutil.add_weight(word_graph, 100)

            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )
            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )
            graph = delete_space + graph + delete_space
            # Accept punctuation-only input as well.
            graph |= punct

            self.fst = graph.optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})

View File

@@ -0,0 +1,228 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst as vCardinal
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDate
from nemo_text_processing.text_normalization.en.verbalizers.decimal import DecimalFst as vDecimal
from nemo_text_processing.text_normalization.en.verbalizers.electronic import ElectronicFst as vElectronic
from nemo_text_processing.text_normalization.en.verbalizers.fraction import FractionFst as vFraction
from nemo_text_processing.text_normalization.en.verbalizers.measure import MeasureFst as vMeasure
from nemo_text_processing.text_normalization.en.verbalizers.money import MoneyFst as vMoney
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinal
from nemo_text_processing.text_normalization.en.verbalizers.roman import RomanFst as vRoman
from nemo_text_processing.text_normalization.en.verbalizers.telephone import TelephoneFst as vTelephone
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord
from pynini.examples import plurals
from pynini.lib import pynutil
from nemo.utils import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.
    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            # Cache key encodes the options that change the compiled grammar.
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}_lm.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse a previously compiled grammar from the FAR cache.
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Additional variant of the grammar that rejects any digit in the output.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=True, lm=True)
            # Keep a handle on the tagger: `cardinal` is rebound to the
            # verbalizer object in the VERBALIZERS section below.
            cardinal_tagger = cardinal
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=True)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=True, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=True)
            measure_graph = measure.fst
            date = DateFst(cardinal=cardinal, deterministic=True, lm=True)
            date_graph = date.fst
            punctuation = PunctuationFst(deterministic=True)
            punct_graph = punctuation.graph
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal, deterministic=True).fst
            telephone_graph = TelephoneFst(deterministic=True).fst
            electronic_graph = ElectronicFst(deterministic=True).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=False).fst
            # NOTE: rebinds the `whitelist` argument (a path) to the WhiteListFst object.
            whitelist = WhiteListFst(input_case=input_case, deterministic=False, input_file=whitelist)
            whitelist_graph = whitelist.graph
            serial_graph = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic, lm=True).fst

            # VERBALIZERS -- note that `cardinal`, `decimal`, `ordinal`,
            # `fraction` and `measure` are rebound to verbalizer objects here.
            cardinal = vCardinal(deterministic=True)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=True)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=True)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=True, lm=True)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=True).fst
            v_electronic_graph = vElectronic(deterministic=True).fst
            measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=False)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=True).fst
            v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic, lm=True).fst
            v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            # Prefer the date reading over cardinal when both taggers match.
            cardinal_or_date_final = plurals._priority_union(date_graph, cardinal_graph, NEMO_SIGMA)
            cardinal_or_date_final = pynini.compose(cardinal_or_date_final, (v_cardinal_graph | v_date_graph))
            time_final = pynini.compose(time_graph, v_time_graph)
            ordinal_final = pynini.compose(ordinal_graph, v_ordinal_graph)

            # Relative weights: semiotic classes < punctuation < plain words.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Each branch composes a tagger with its verbalizer, producing the
            # final verbalized form directly (single-pass classification).
            classify_and_verbalize = (
                pynutil.add_weight(time_final, sem_w)
                | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(ordinal_final, sem_w)
                | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
                | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(cardinal_or_date_final, sem_w)
                | pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph), 1.1001
                )  # should be higher than the rest of the classes
            ).optimize()

            roman_graph = RomanFst(deterministic=deterministic, lm=True).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(pynini.compose(roman_graph, v_roman_graph), sem_w)

            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(
                time=time_final, cardinal=cardinal_tagger, date=date_final, deterministic=deterministic
            ).fst
            classify_and_verbalize |= pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)

            # Mark semiotic spans with "< ... >"; plain words stay unmarked.
            classify_and_verbalize = pynutil.insert("< ") + classify_and_verbalize + pynutil.insert(" >")
            classify_and_verbalize |= pynutil.add_weight(word_graph, word_w)

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            def get_token_sem_graph(classify_and_verbalize):
                # Wrap each token with optional punctuation on either side and
                # normalize whitespace across the whole sentence.
                token_plus_punct = (
                    pynini.closure(punct + pynutil.insert(" "))
                    + classify_and_verbalize
                    + pynini.closure(pynutil.insert(" ") + punct)
                )
                graph = token_plus_punct + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                )
                # Accept punctuation-only input.
                graph |= punct_only + pynini.closure(punct)
                graph = delete_space + graph + delete_space
                # Collapse runs of spaces; the second alternative also strips
                # leading spaces.
                remove_extra_spaces = pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                    delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)
                )
                remove_extra_spaces |= (
                    pynini.closure(pynutil.delete(" "), 1)
                    + pynini.closure(NEMO_NOT_SPACE, 1)
                    + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
                )
                graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
                return graph

            self.fst = get_token_sem_graph(classify_and_verbalize)
            # Variant of the grammar whose output may not contain digits.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')

View File

@@ -0,0 +1,229 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.abbreviation import AbbreviationFst
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.abbreviation import AbbreviationFst as vAbbreviation
from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst as vCardinal
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDate
from nemo_text_processing.text_normalization.en.verbalizers.decimal import DecimalFst as vDecimal
from nemo_text_processing.text_normalization.en.verbalizers.electronic import ElectronicFst as vElectronic
from nemo_text_processing.text_normalization.en.verbalizers.fraction import FractionFst as vFraction
from nemo_text_processing.text_normalization.en.verbalizers.measure import MeasureFst as vMeasure
from nemo_text_processing.text_normalization.en.verbalizers.money import MoneyFst as vMoney
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinal
from nemo_text_processing.text_normalization.en.verbalizers.roman import RomanFst as vRoman
from nemo_text_processing.text_normalization.en.verbalizers.telephone import TelephoneFst as vTelephone
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord
from pynini.lib import pynutil
from nemo.utils import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.
    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            # Cache key encodes the options that change the compiled grammar.
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse a previously compiled grammar from the FAR cache.
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Additional variant of the grammar that rejects any digit in the output.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
            # SerialFst below always receives a deterministic ordinal.
            deterministic_ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
            punctuation = PunctuationFst(deterministic=True)
            punct_graph = punctuation.graph
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
            # NOTE: rebinds the `whitelist` argument (a path) to the WhiteListFst object;
            # the object is reused below by AbbreviationFst.
            whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
            whitelist_graph = whitelist.graph
            serial_graph = SerialFst(cardinal=cardinal, ordinal=deterministic_ordinal, deterministic=deterministic).fst

            # VERBALIZERS -- note that `cardinal`, `decimal`, `ordinal`,
            # `fraction` and `measure` are rebound to verbalizer objects here.
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            # The range grammar always gets deterministic time/date endpoints.
            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True), deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(
                time=time_final, date=date_final, cardinal=CardinalFst(deterministic=True), deterministic=deterministic
            ).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            # Relative weights: semiotic classes < punctuation < plain words.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Each branch composes a tagger with its verbalizer, producing the
            # verbalized form directly (single-pass classification).
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph), sem_w)
                | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
                | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph), sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph), 1.1001
                )  # should be higher than the rest of the classes
            ).optimize()

            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(pynini.compose(roman_graph, v_roman_graph), word_w)
                abbreviation_graph = AbbreviationFst(whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w
                )

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )
            # Wrap each token with optional punctuation on either side.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + classify_and_verbalize
                + pynini.closure(pynutil.insert(" ") + punct)
            )
            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )
            # Accept punctuation-only input.
            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space
            # Collapse runs of spaces; the second alternative also strips
            # leading spaces.
            remove_extra_spaces = pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)
            )
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1)
                + pynini.closure(NEMO_NOT_SPACE, 1)
                + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
            )
            graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
            self.fst = graph
            # Variant of the grammar whose output may not contain digits.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')

View File

@@ -0,0 +1,151 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_UPPER,
SINGULAR_TO_PLURAL,
GraphFst,
convert_space,
)
from nemo_text_processing.text_normalization.en.taggers.roman import get_names
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelist, e.g.
        misses -> tokens { name: "mrs" }
    for non-deterministic case: "Dr. Abc" ->
        tokens { name: "drive" } tokens { name: "Abc" }
        tokens { name: "doctor" } tokens { name: "Abc" }
        tokens { name: "Dr." } tokens { name: "Abc" }
    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        input_file: path to a file with whitelist replacements
    """

    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Build a string map from a TSV of (written, spoken) pairs,
            # lowering the input side when input_case is "lower_cased".
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]
            if keep_punct_add_end:
                # Also accept entries followed by sentence-final punctuation.
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))
            graph = pynini.string_map(whitelist)
            return graph

        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/UK_to_US.tsv"))  # Jiayu 2022.10
        # Symbol replacements apply only when the token contains no "/".
        graph |= pynini.compose(
            pynini.difference(NEMO_SIGMA, pynini.accep("/")).optimize(),
            _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv")),
        ).optimize()

        if deterministic:
            names = get_names()
            # "St"/"st"/"ST" followed by a known name expands to "Saint".
            graph |= (
                pynini.cross(pynini.union("st", "St", "ST"), "Saint")
                + pynini.closure(pynutil.delete("."))
                + pynini.accep(" ")
                + names
            )
        else:
            graph |= _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )
            # Strip periods from dotted upper-case abbreviations ("U.S." -> "US").
            for x in [".", ". "]:
                graph |= (
                    NEMO_UPPER
                    + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                    + pynini.closure(pynutil.delete("."), 0, 1)
                )

        if not deterministic:
            multiple_forms_whitelist_graph = get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))
            graph |= multiple_forms_whitelist_graph

            # Accept measure units (singular and plural) on tokens of >= 3 chars.
            graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
            units_graph = pynini.compose(NEMO_CHAR ** (3, ...), convert_space(graph_unit | graph_unit_plural))
            graph |= units_graph

            # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
            # e.g. "IN", "OH", "OK"
            # TODO or only exclude above?
            states = load_labels(get_abs_path("data/address/state.tsv"))
            additional_options = []
            for x, y in states:
                if input_case == "lower_cased":
                    x = x.lower()
                additional_options.append((x, f"{y[0]}.{y[1:]}"))
                if not deterministic:
                    additional_options.append((x, f"{y[0]}.{y[1:]}."))
            states.extend(additional_options)
            state_graph = pynini.string_map(states)
            graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()

        if input_file:
            # A user-provided whitelist replaces the built-in one in
            # deterministic mode, and augments it otherwise.
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                graph = whitelist_provided

        self.graph = (convert_space(graph)).optimize()
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def get_formats(input_f, input_case="cased", is_default=True):
    """
    Adds various abbreviation format options to the list of acceptable input forms
    """
    multiple_formats = load_labels(input_f)

    extra_forms = []
    for abbr, expansion in multiple_formats:
        if input_case == "lower_cased":
            abbr = abbr.lower()
        capitalized_abbr = abbr[0].upper() + abbr[1:]
        capitalized_exp = expansion[0].upper() + expansion[1:]
        # default "dr" -> doctor, this includes period "dr." -> doctor
        extra_forms.append((abbr + ".", expansion))
        # "Dr" -> Doctor
        extra_forms.append((capitalized_abbr, capitalized_exp))
        # "Dr." -> Doctor
        extra_forms.append((capitalized_abbr + ".", capitalized_exp))
    multiple_formats.extend(extra_forms)

    if not is_default:
        # Tag each pair with raw/norm markers instead of the plain expansion.
        multiple_formats = [
            (raw, f"|raw_start|{raw}|raw_end||norm_start|{norm}|norm_end|") for (raw, norm) in multiple_formats
        ]
    return pynini.string_map(multiple_formats)

View File

@@ -0,0 +1,90 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
MIN_NEG_WEIGHT,
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
GraphFst,
convert_space,
get_abs_path,
)
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from pynini.examples import plurals
from pynini.lib import pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for classifying word. Considers sentence boundary exceptions.
    e.g. sleep -> tokens { name: "sleep" }

    Args:
        punctuation: PunctuationFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, punctuation: GraphFst, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)
        punct = PunctuationFst().graph
        # A "default" word: one or more non-space characters that are not punctuation.
        default_graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)
        # FIX: two currency symbols ("€" and "₩") had been corrupted into empty
        # strings here. An empty string inside pynini.union adds an epsilon arc,
        # silently changing the accepted language. Restored per upstream NeMo.
        symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)
        # Prefer (via negative weight) words without digits/currency symbols;
        # fall back to the default word graph otherwise.
        graph = pynutil.add_weight(graph, MIN_NEG_WEIGHT) | default_graph

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.accep(pynini.escape("]"))
        )

        # leave IPA phones of format [ˈdoʊv] untouched, single words and sentences with punctuation marks allowed
        punct_marks = pynini.union(*punctuation.punct_marks).optimize()
        stress = pynini.union("ˈ", "'", "ˌ")
        ipa_phoneme_unit = pynini.string_file(get_abs_path("data/whitelist/ipa_symbols.tsv"))
        # word in ipa form: optional leading stress mark, then IPA symbols/stress marks
        ipa_phonemes = (
            pynini.closure(stress, 0, 1)
            + pynini.closure(ipa_phoneme_unit, 1)
            + pynini.closure(stress | ipa_phoneme_unit)
        )
        # allow sentences of words in IPA format separated with spaces or punct marks
        delim = (punct_marks | pynini.accep(" ")) ** (1, ...)
        ipa_phonemes = ipa_phonemes + pynini.closure(delim + ipa_phonemes) + pynini.closure(delim, 0, 1)
        ipa_phonemes = (pynini.accep(pynini.escape("[")) + ipa_phonemes + pynini.accep(pynini.escape("]"))).optimize()

        if not deterministic:
            # Non-deterministic mode also tolerates optional spaces just inside the brackets.
            phoneme = (
                pynini.accep(pynini.escape("["))
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.closure(phoneme_unit + pynini.accep(" "))
                + phoneme_unit
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.accep(pynini.escape("]"))
            ).optimize()
            # NOTE(review): ipa_phonemes was already bracketed above, so this wraps it
            # in a second pair of brackets ("[[...]]"). Looks suspicious — confirm
            # against upstream before relying on non-deterministic IPA matching.
            ipa_phonemes = (
                pynini.accep(pynini.escape("[")) + ipa_phonemes + pynini.accep(pynini.escape("]"))
            ).optimize()

        phoneme |= ipa_phonemes
        # Phoneme strings take priority over the plain word graph when both match.
        self.graph = plurals._priority_union(convert_space(phoneme.optimize()), graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()

View File

@@ -0,0 +1,60 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import os
def get_abs_path(rel_path):
    """
    Get absolute path.

    Args:
        rel_path: path relative to the directory containing this file

    Returns the absolute path as a string.
    """
    # os.path.join is portable and avoids manual '/' concatenation.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
def load_labels(abs_path):
    """
    Loads a tab-separated file as a list of label rows.

    Args:
        abs_path: absolute path to a tsv file

    Returns a list of rows, each a list of column strings.
    """
    # FIX: the original opened the file and never closed it (leaked handle);
    # a with-statement guarantees closure even if csv parsing raises.
    with open(abs_path, encoding="utf-8") as label_tsv:
        return list(csv.reader(label_tsv, delimiter="\t"))
def augment_labels_with_punct_at_end(labels):
    """
    Augments labels: if a key ends in a period that its value lacks, emit an
    additional label whose value keeps the period.

    Args:
        labels: input label rows (each a list; first element is the raw key,
            second the normalized value)

    Returns:
        the additional label rows only (not the originals)
    """
    augmented = []
    for entry in labels:
        # rows with fewer than two columns carry no (key, value) pair to augment
        if len(entry) <= 1:
            continue
        key, value = entry[0], entry[1]
        if key[-1] == "." and value[-1] != ".":
            augmented.append([key, value + "."] + entry[2:])
    return augmented

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,35 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class AbbreviationFst(GraphFst):
    """
    Finite state transducer for verbalizing abbreviations
    e.g. tokens { abbreviation { value: "A B C" } } -> "ABC"

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="abbreviation", kind="verbalize", deterministic=deterministic)
        # Strip the serialized field wrapper, keeping only the quoted payload.
        open_field = pynutil.delete("value: \"")
        close_quote = pynutil.delete("\"")
        payload = pynini.closure(NEMO_NOT_QUOTE, 1)
        self.fst = self.delete_tokens(open_field + payload + close_quote).optimize()

View File

@@ -0,0 +1,45 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal, e.g.
    cardinal { negative: "true" integer: "23" } -> minus twenty three

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
        # Verbalize the sign field; non-deterministic mode adds "negative" as a variant.
        sign = pynini.cross("negative: \"true\"", "minus ")
        if not deterministic:
            sign |= pynini.cross("negative: \"true\"", "negative ")
        self.optional_sign = pynini.closure(sign + delete_space, 0, 1)

        # The quoted integer payload; exposed for reuse by other verbalizers.
        quoted_value = pynini.closure(NEMO_NOT_QUOTE)
        self.integer = delete_space + pynutil.delete("\"") + quoted_value + pynutil.delete("\"")

        self.numbers = self.optional_sign + pynutil.delete("integer:") + self.integer
        self.fst = self.delete_tokens(self.numbers).optimize()

View File

@@ -0,0 +1,101 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.examples import plurals
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
    date { month: "february" day: "five" year: "twenty twelve" preserve_order: true } -> february fifth twenty twelve
    date { day: "five" month: "february" year: "twenty twelve" preserve_order: true } -> the fifth of february twenty twelve

    Args:
        ordinal: OrdinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: presumably enables a language-model rescoring variant (only gates the
            extra cardinal-day path below) — TODO confirm against callers
    """

    def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)
        # Quoted month payload (already verbalized, e.g. "february").
        month = pynini.closure(NEMO_NOT_QUOTE, 1)
        # day field with the cardinal words kept as-is ("five").
        day_cardinal = (
            pynutil.delete("day:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        # Compose with the ordinal suffix rewriter: "five" -> "fifth" (per class example).
        day = day_cardinal @ ordinal.suffix
        month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"")
        year = (
            pynutil.delete("year:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + delete_space
            + pynutil.delete("\"")
        )
        # month (day) year — day and year are both optional here.
        graph_mdy = (
            month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1)
        )
        # may 5 -> may five (cardinal day variant, non-deterministic mode only)
        if not deterministic and not lm:
            graph_mdy |= (
                month
                + pynini.closure(delete_extra_space + day_cardinal, 0, 1)
                + pynini.closure(delete_extra_space + year, 0, 1)
            )
        # day month year -> "the fifth of february ..."
        graph_dmy = (
            pynutil.insert("the ")
            + day
            + delete_extra_space
            + pynutil.insert("of ")
            + month
            + pynini.closure(delete_extra_space + year, 0, 1)
        )
        # Consume either a "preserve_order: true" flag or a quoted "field_order" value.
        optional_preserve_order = pynini.closure(
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete("\"")
            + NEMO_NOT_QUOTE
            + pynutil.delete("\"")
            + delete_space
        )
        # mdy wins over dmy when both match (dmy carries a small penalty weight);
        # a bare year is also accepted.
        final_graph = (
            (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year)
            + delete_space
            + optional_preserve_order
        )
        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,67 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from pynini.lib import pynutil
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal, e.g.
    decimal { negative: "true" integer_part: "twelve" fractional_part: "five o o six" quantity: "billion" }
        -> minus twelve point five o o six billion

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)
        # Sign verbalization; "negative" is an extra variant in non-deterministic mode.
        sign = pynini.cross("negative: \"true\"", "minus ")
        if not deterministic:
            sign |= pynini.cross("negative: \"true\"", "negative ")
        self.optional_sign = pynini.closure(sign + delete_space, 0, 1)

        # integer_part reuses the cardinal verbalizer's quoted-integer graph.
        self.integer = pynutil.delete("integer_part:") + cardinal.integer
        self.optional_integer = pynini.closure(self.integer + delete_space + insert_space, 0, 1)

        strip_quote = pynutil.delete("\"")
        self.fractional_default = (
            pynutil.delete("fractional_part:")
            + delete_space
            + strip_quote
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + strip_quote
        )
        # "point" precedes the spoken fractional digits.
        self.fractional = pynutil.insert("point ") + self.fractional_default

        # Optional quantity word such as "billion".
        self.quantity = (
            delete_space
            + insert_space
            + pynutil.delete("quantity:")
            + delete_space
            + strip_quote
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + strip_quote
        )
        self.optional_quantity = pynini.closure(self.quantity, 0, 1)

        # Accept: integer alone, integer+quantity, or (integer?) fraction (quantity?).
        self.numbers = self.optional_sign + (
            self.integer
            | (self.integer + self.quantity)
            | (self.optional_integer + self.fractional + self.optional_quantity)
        )
        self.fst = self.delete_tokens(self.numbers).optimize()

View File

@@ -0,0 +1,97 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_NOT_SPACE,
NEMO_SIGMA,
TO_UPPER,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.examples import plurals
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic
    e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> c d f one at a b c dot e d u

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
        # Digit verbalization ("1" -> "one"); zero gains spoken variants
        # ("o", "oh") in non-deterministic mode.
        graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")
        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
        # Spell digits/symbols character-by-character, padded with spaces;
        # restricted to space-free input via the compose below.
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
        )
        default_chars_symbols = pynini.compose(
            pynini.closure(NEMO_NOT_SPACE), default_chars_symbols.optimize()
        ).optimize()
        # username field: quoted payload spelled out character-wise.
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete("\"")
            + default_chars_symbols
            + pynutil.delete("\"")
        )
        domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        # Known domain endings (from the tsv) win over a generic "." -> "dot"
        # rewrite (which carries a small penalty weight).
        domain = (
            default_chars_symbols
            + insert_space
            + plurals._priority_union(
                domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
            )
            + pynini.closure(
                insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
            )
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete("\"")
            + domain
            + delete_space
            + pynutil.delete("\"")
        ).optimize()
        # protocol field is passed through verbatim (already verbalized upstream).
        protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        # protocol? username-"at"? domain, then collapse any doubled spaces.
        graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,88 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst
from pynini.examples import plurals
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fraction
    e.g. tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } ->
    twenty three and four fifth

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: presumably a language-model rescoring mode (only gates whether the
            "and" conjunction becomes optional) — TODO confirm against callers
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
        # Ordinal suffix rewriter (e.g. "five" -> "fifth", per class example).
        suffix = OrdinalFst().suffix
        integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
        # Special-cased denominators: "one" -> "over one", "two" -> "half",
        # "four" -> "quarter"; everything else takes the ordinal suffix.
        denominator_one = pynini.cross("denominator: \"one\"", "over one")
        denominator_half = pynini.cross("denominator: \"two\"", "half")
        denominator_quarter = pynini.cross("denominator: \"four\"", "quarter")
        denominator_rest = (
            pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) @ suffix + pynutil.delete("\"")
        )
        # Nested priority unions: one > half > quarter > ordinal fallback.
        denominators = plurals._priority_union(
            denominator_one,
            plurals._priority_union(
                denominator_half,
                plurals._priority_union(denominator_quarter, denominator_rest, NEMO_SIGMA),
                NEMO_SIGMA,
            ),
            NEMO_SIGMA,
        ).optimize()
        if not deterministic:
            # Also allow "four" -> "fourth" as an alternative to "quarter".
            denominators |= pynutil.delete("denominator: \"") + (pynini.accep("four") @ suffix) + pynutil.delete("\"")
        # numerator "one" keeps a singular denominator ("one fifth").
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ")
        numerator_one = numerator_one + insert_space + denominators
        # Any other numerator pluralizes the denominator ("four fifths").
        numerator_rest = (
            pynutil.delete("numerator: \"")
            + (pynini.closure(NEMO_NOT_QUOTE) - pynini.accep("one"))
            + pynutil.delete("\" ")
        )
        numerator_rest = numerator_rest + insert_space + denominators
        # Pluralize at end of string: "half" -> "halves", otherwise append "s".
        numerator_rest @= pynini.cdrewrite(
            plurals._priority_union(pynini.cross("half", "halves"), pynutil.insert("s"), NEMO_SIGMA),
            "",
            "[EOS]",
            NEMO_SIGMA,
        )
        graph = numerator_one | numerator_rest
        conjunction = pynutil.insert("and ")
        if not deterministic and not lm:
            conjunction = pynini.closure(conjunction, 0, 1)
        # Optional integer part joined by the conjunction ("twenty three and ...").
        integer = pynini.closure(integer + insert_space + conjunction, 0, 1)
        graph = integer + graph
        # Final clean-ups: "and one half" -> "and a half"; undo the "s" wrongly
        # appended to the "over one" form.
        graph @= pynini.cdrewrite(
            pynini.cross("and one half", "and a half") | pynini.cross("over ones", "over one"), "", "[EOS]", NEMO_SIGMA
        )
        self.graph = graph
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()

Some files were not shown because too many files have changed in this diff Show More