This commit is contained in:
zhousha
2025-08-06 15:38:55 +08:00
parent 4916ad0fe0
commit 55a67e817e
193 changed files with 51647 additions and 1 deletions

View File

@@ -0,0 +1,10 @@
# Text Normalization
Text Normalization is part of NeMo's `nemo_text_processing` - a Python package that is installed with the `nemo_toolkit`.
It converts text from written form into its verbalized form, e.g. "123" -> "one hundred twenty three".
See [NeMo documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/text_normalization/wfst/wfst_text_normalization.html) for details.
Tutorial with overview of the package capabilities: [Text_(Inverse)_Normalization.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb)
Tutorial on how to customize the underlying grammars: [WFST_Tutorial.ipynb](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/text_processing/WFST_Tutorial.ipynb)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,350 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
import string
import sys
from collections import defaultdict, namedtuple
from typing import Dict, List, Optional, Set, Tuple
from unicodedata import category
# Sentinel token types used by the Kaggle Google text-normalization data format.
EOS_TYPE = "EOS"
PUNCT_TYPE = "PUNCT"
PLAIN_TYPE = "PLAIN"
# One dataset entry: semiotic class, written (un-normalized) form, spoken (normalized) form.
Instance = namedtuple('Instance', 'token_type un_normalized normalized')
# Semiotic classes expected to occur in the dataset.
known_types = [
    "PLAIN",
    "DATE",
    "CARDINAL",
    "LETTERS",
    "VERBATIM",
    "MEASURE",
    "DECIMAL",
    "ORDINAL",
    "DIGIT",
    "MONEY",
    "TELEPHONE",
    "ELECTRONIC",
    "FRACTION",
    "TIME",
    "ADDRESS",
]
def _load_kaggle_text_norm_file(file_path: str) -> List[Instance]:
    """
    https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
    Loads text file in the Kaggle Google text normalization file format: <semiotic class>\t<unnormalized text>\t<`self` if trivial class or normalized text>
    E.g.
    PLAIN	Brillantaisia	<self>
    PLAIN	is	<self>
    PLAIN	a	<self>
    PLAIN	genus	<self>
    PLAIN	of	<self>
    PLAIN	plant	<self>
    PLAIN	in	<self>
    PLAIN	family	<self>
    PLAIN	Acanthaceae	<self>
    PUNCT	.	sil
    <eos>	<eos>

    Args:
        file_path: file path to text file

    Returns: flat list of instances (PUNCT rows are dropped; <eos> rows become EOS markers)
    """
    instances = []
    with open(file_path, 'r') as fp:
        for line in fp:
            fields = line.strip().split("\t")
            if fields[0] == "<eos>":
                # sentence boundary marker
                instances.append(Instance(token_type=EOS_TYPE, un_normalized="", normalized=""))
                continue
            token_type, token, spoken = fields
            token = token.lower()
            spoken = spoken.lower()
            if token_type == PLAIN_TYPE:
                # trivial class: the spoken form equals the written form
                instances.append(Instance(token_type=token_type, un_normalized=token, normalized=token))
            elif token_type != PUNCT_TYPE:
                instances.append(Instance(token_type=token_type, un_normalized=token, normalized=spoken))
    return instances
def load_files(file_paths: List[str], load_func=_load_kaggle_text_norm_file) -> List[Instance]:
    """
    Load given list of text files using the `load_func` function.

    Args:
        file_paths: list of file paths
        load_func: loading function

    Returns: flat list of instances
    """
    loaded = []
    for path in file_paths:
        loaded += load_func(file_path=path)
    return loaded
def clean_generic(text: str) -> str:
    """
    Cleans text without affecting semiotic classes.

    Args:
        text: string

    Returns: stripped, lower-cased string
    """
    return text.strip().lower()
def evaluate(preds: List[str], labels: List[str], input: Optional[List[str]] = None, verbose: bool = True) -> float:
    """
    Evaluates accuracy given predictions and labels.

    Args:
        preds: predictions
        labels: labels
        input: optional, only needed for verbosity
        verbose: if true prints [input], golden labels and predictions for every mismatch

    Returns accuracy (0.0 when `preds` is empty, instead of raising ZeroDivisionError)
    """
    # fix: empty input previously crashed with ZeroDivisionError
    if not preds:
        return 0.0
    acc = 0
    for i, pred in enumerate(preds):
        pred_norm = clean_generic(pred)
        label_norm = clean_generic(labels[i])
        if pred_norm == label_norm:
            acc += 1
        elif verbose:
            # fix: `verbose` was documented but never checked; mismatch details
            # are now only printed when verbose is requested
            if input:
                # fix: message previously read "inpu:"
                print(f"input: {json.dumps(input[i])}")
            print(f"gold: {json.dumps(label_norm)}")
            print(f"pred: {json.dumps(pred_norm)}")
    return acc / len(preds)
def training_data_to_tokens(
    data: List[Instance], category: Optional[str] = None
) -> Dict[str, Tuple[List[str], List[str]]]:
    """
    Filters the instance list by category if provided and converts it into a map from token type to list of un_normalized and normalized strings

    Args:
        data: list of instances
        category: optional semiotic class category name (NOTE: this parameter
            shadows `unicodedata.category` imported at module level)

    Returns Dict: token type -> (list of un_normalized strings, list of normalized strings)
    """
    result = defaultdict(lambda: ([], []))
    for instance in data:
        if instance.token_type == EOS_TYPE:
            continue
        if category is not None and instance.token_type != category:
            continue
        un_norm_list, norm_list = result[instance.token_type]
        un_norm_list.append(instance.un_normalized)
        norm_list.append(instance.normalized)
    return result
def training_data_to_sentences(data: List[Instance]) -> Tuple[List[str], List[str], List[Set[str]]]:
    """
    Takes instance list, creates list of sentences split by EOS_Token

    Args:
        data: list of instances

    Returns (list of unnormalized sentences, list of normalized sentences, list of sets of categories in a sentence)
    """
    sentences = []
    categories = []
    current_sentence = []
    current_categories = set()
    for instance in data:
        if instance.token_type == EOS_TYPE:
            # close the current sentence at each EOS boundary
            sentences.append(current_sentence)
            categories.append(current_categories)
            current_sentence = []
            current_categories = set()
        else:
            current_sentence.append(instance)
            current_categories.add(instance.token_type)
    un_normalized = [" ".join(inst.un_normalized for inst in sentence) for sentence in sentences]
    normalized = [" ".join(inst.normalized for inst in sentence) for sentence in sentences]
    return un_normalized, normalized, categories
def post_process_punctuation(text: str) -> str:
    """
    Normalized quotes and spaces

    Args:
        text: text

    Returns: text with normalized spaces and quotes
    """
    # NOTE(review): several .replace() source arguments below render as empty
    # strings here — they appear to have lost their original unicode quote
    # characters (mojibake). `"x".replace('', '"')` would insert quotes between
    # every character, so verify these literals against the original file
    # encoding before relying on this function.
    text = (
        text.replace('( ', '(')
        .replace(' )', ')')
        .replace('{ ', '{')
        .replace(' }', '}')
        .replace('[ ', '[')
        .replace(' ]', ']')
        .replace('  ', ' ')
        .replace('', '"')
        .replace("", "'")
        .replace("»", '"')
        .replace("«", '"')
        .replace("\\", "")
        .replace("", '"')
        .replace("´", "'")
        .replace("", "'")
        .replace('', '"')
        .replace("", "'")
        .replace('`', "'")
        .replace('- -', "--")
    )
    # remove the space that Moses detokenization leaves before closing punctuation
    for punct in "!,.:;?":
        text = text.replace(f' {punct}', punct)
    return text.strip()
def pre_process(text: str) -> str:
    """
    Optional text preprocessing before normalization (part of TTS TN pipeline)

    Args:
        text: string that may include semiotic classes

    Returns: text with spaces around punctuation marks
    """
    for bracket in '[]':
        text = text.replace(bracket, f' {bracket} ')
    # collapse any run of spaces introduced above into a single space
    return re.sub(r' +', ' ', text)
def load_file(file_path: str) -> List[str]:
    """
    Loads given text file with separate lines into list of string.

    Args:
        file_path: file path

    Returns: flat list of string (line terminators are preserved)
    """
    with open(file_path, 'r') as fp:
        return list(fp)
def write_file(file_path: str, data: List[str]):
    """
    Writes out list of string to file.

    Args:
        file_path: file path
        data: list of string (a newline is appended to every entry)
    """
    with open(file_path, 'w') as fp:
        fp.writelines(line + '\n' for line in data)
def post_process_punct(input: str, normalized_text: str, add_unicode_punct: bool = False) -> str:
    """
    Post-processing of the normalized output to match input in terms of spaces around punctuation marks.

    After NN normalization, Moses detokenization puts a space after
    punctuation marks, and attaches an opening quote "'" to the word to the right.
    E.g., input to the TN NN model is "12 test' example",
    after normalization and detokenization -> "twelve test 'example" (the quote is considered to be an opening quote,
    but it doesn't match the input and can cause issues during TTS voice generation.)
    The current function will match the punctuation and spaces of the normalized text with the input sequence.
    "12 test' example" -> "twelve test 'example" -> "twelve test' example" (the quote was shifted to match the input).

    Args:
        input: input text (original input to the NN, before normalization or tokenization)
        normalized_text: output text (output of the TN NN model)
        add_unicode_punct: set to True to handle unicode punctuation marks as well as default string.punctuation (increases post processing time)

    Returns: normalized text with punctuation spacing adjusted to match the input
    """
    # in the post-processing WFST graph "``" are replaced with '"' quotes (otherwise single quotes "`" won't be handled correctly)
    # this function fixes spaces around them based on input sequence, so here we're making the same double quote replacement
    # to make sure these new double quotes work with this function
    if "``" in input and "``" not in normalized_text:
        input = input.replace("``", '"')
    # work on character lists so single characters can be edited in place
    input = list(input)
    normalized_text = list(normalized_text)
    punct_marks = [p for p in string.punctuation if p in input]
    if add_unicode_punct:
        # fix: the original reassigned the result of list.extend() (which is None)
        # and referenced the undefined name `punct_default`
        punct_marks.extend(
            chr(i)
            for i in range(sys.maxunicode)
            if category(chr(i)).startswith("P") and chr(i) not in string.punctuation and chr(i) in input
        )

    def _is_valid(idx_out, idx_in, normalized_text, input):
        """Check if previous or next word match (for cases when punctuation marks are part of
        semiotic token, i.e. some punctuation can be missing in the normalized text)"""
        return (idx_out > 0 and idx_in > 0 and normalized_text[idx_out - 1] == input[idx_in - 1]) or (
            idx_out < len(normalized_text) - 1
            and idx_in < len(input) - 1
            and normalized_text[idx_out + 1] == input[idx_in + 1]
        )

    for punct in punct_marks:
        try:
            # when counts differ, some occurrences belong to semiotic tokens and
            # must pass the neighbor check before their spacing is adjusted
            equal = input.count(punct) == normalized_text.count(punct)
            idx_in, idx_out = 0, 0
            while punct in input[idx_in:]:
                idx_out = normalized_text.index(punct, idx_out)
                idx_in = input.index(punct, idx_in)
                if not equal and not _is_valid(idx_out, idx_in, normalized_text, input):
                    idx_in += 1
                    continue
                # align the space (or absence of one) BEFORE the mark with the input
                if idx_in > 0 and idx_out > 0:
                    if normalized_text[idx_out - 1] == " " and input[idx_in - 1] != " ":
                        normalized_text[idx_out - 1] = ""
                    elif normalized_text[idx_out - 1] != " " and input[idx_in - 1] == " ":
                        normalized_text[idx_out - 1] += " "
                # align the space (or absence of one) AFTER the mark with the input
                if idx_in < len(input) - 1 and idx_out < len(normalized_text) - 1:
                    if normalized_text[idx_out + 1] == " " and input[idx_in + 1] != " ":
                        normalized_text[idx_out + 1] = ""
                    elif normalized_text[idx_out + 1] != " " and input[idx_in + 1] == " ":
                        normalized_text[idx_out] = normalized_text[idx_out] + " "
                idx_out += 1
                idx_in += 1
        except ValueError:
            # .index() misses are expected when punctuation is part of a semiotic
            # token; skip this mark rather than fail (was a bare `except:`)
            pass
    return re.sub(r' +', ' ', "".join(normalized_text))

View File

@@ -0,0 +1,17 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst

View File

@@ -0,0 +1,342 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser
from typing import List
import regex as re
from nemo_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (list of instances) for text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should only include characters and whitespace characters but no punctuation.
Cardinal unnormalized instances should contain at least one integer and all other characters are removed.
"""
class Filter:
    """
    Filter class

    Args:
        class_type: semiotic class used in dataset
        process_func: function to transform text
        filter_func: function to filter text
    """

    def __init__(self, class_type: str, process_func: object, filter_func: object):
        self.class_type = class_type
        self.process_func = process_func
        self.filter_func = filter_func

    def filter(self, instance: Instance) -> bool:
        """
        Applies the filter function to the given instance.

        Returns: True when the instance does not belong to this class type,
            otherwise whatever `filter_func` returns for it
        """
        return True if instance.token_type != self.class_type else self.filter_func(instance)

    def process(self, instance: Instance) -> Instance:
        """
        Applies the process function to the given instance.

        Returns: the processed instance when it belongs to this class type,
            otherwise the original instance unchanged
        """
        return instance if instance.token_type != self.class_type else self.process_func(instance)
def filter_cardinal_1(instance: Instance) -> bool:
    """Keep CARDINAL instances whose written form contains at least one digit."""
    # fix: return an actual bool as annotated, not a truthy re.Match object
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_cardinal_1(instance: Instance) -> Instance:
    """Strip non-digits from the written form; keep only letters/spaces in the spoken form."""
    un_normalized = re.sub(r"[^0-9]", "", instance.un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_ordinal_1(instance: Instance) -> bool:
    """Keep ORDINAL instances ending with an English ordinal suffix (st/nd/rd/th)."""
    return bool(re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized))


def process_ordinal_1(instance: Instance) -> Instance:
    """Remove commas and whitespace from the written form; keep only letters/spaces in the spoken form."""
    un_normalized = re.sub(r"[,\s]", "", instance.un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_decimal_1(instance: Instance) -> bool:
    """Keep DECIMAL instances whose written form contains at least one digit."""
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_decimal_1(instance: Instance) -> Instance:
    """Drop thousands separators from the written form; keep only letters/spaces in the spoken form."""
    un_normalized = re.sub(r",", "", instance.un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
def filter_measure_1(instance: Instance) -> bool:
    """Accept every MEASURE instance."""
    return True


def process_measure_1(instance: Instance) -> Instance:
    """Clean MEASURE pairs: strip separators and unit artifacts from the written form,
    reduce the spoken form to lowercase letters/spaces and singularize 'per ...s'."""
    un_normalized = instance.un_normalized
    un_normalized = re.sub(r",", "", un_normalized)
    un_normalized = re.sub(r"m2", "", un_normalized)
    # insert a space between a digit and a following unit character
    un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
    normalized = re.sub(r"[^a-z\s]", "", instance.normalized)
    # "per meters" -> "per meter"
    normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
    normalized = re.sub(r"[^a-z ]", "", normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_money_1(instance: Instance) -> bool:
    """Keep MONEY instances whose written form contains at least one digit."""
    # fix: return an actual bool as annotated, not a truthy re.Match object
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_money_1(instance: Instance) -> Instance:
    """Canonicalize currency notation in the written form (drop commas, map a$/us$ -> $,
    expand m/bn suffixes); keep only letters/spaces in the spoken form."""
    un_normalized = instance.un_normalized
    un_normalized = re.sub(r",", "", un_normalized)
    un_normalized = re.sub(r"a\$", r"$", un_normalized)
    un_normalized = re.sub(r"us\$", r"$", un_normalized)
    un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
    un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_time_1(instance: Instance) -> bool:
    """Keep TIME instances whose written form contains at least one digit."""
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_time_1(instance: Instance) -> Instance:
    """Normalize time notation in the written form (tighten colons, canonical a.m./p.m.);
    keep only letters/spaces in the spoken form."""
    un_normalized = instance.un_normalized
    un_normalized = re.sub(r": ", ":", un_normalized)
    un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
    un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)
def filter_plain_1(instance: Instance) -> bool:
    """Accept every PLAIN instance."""
    return True


def process_plain_1(instance: Instance) -> Instance:
    """Return a copy of the PLAIN instance, unchanged."""
    return Instance(instance.token_type, instance.un_normalized, instance.normalized)


def filter_punct_1(instance: Instance) -> bool:
    """Accept every PUNCT instance."""
    return True


def process_punct_1(instance: Instance) -> Instance:
    """Return a copy of the PUNCT instance, unchanged."""
    return Instance(instance.token_type, instance.un_normalized, instance.normalized)


def filter_date_1(instance: Instance) -> bool:
    """Accept every DATE instance."""
    return True


def process_date_1(instance: Instance) -> Instance:
    """Drop commas from the written form; keep only letters/spaces in the spoken form."""
    un_normalized = re.sub(r",", "", instance.un_normalized)
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized)


def filter_letters_1(instance: Instance) -> bool:
    """Accept every LETTERS instance."""
    return True


def process_letters_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)


def filter_verbatim_1(instance: Instance) -> bool:
    """Accept every VERBATIM instance."""
    return True


def process_verbatim_1(instance: Instance) -> Instance:
    """Return a copy of the VERBATIM instance, unchanged."""
    return Instance(instance.token_type, instance.un_normalized, instance.normalized)
def filter_digit_1(instance: Instance) -> bool:
    """Keep DIGIT instances whose written form contains at least one digit."""
    # fix: return an actual bool as annotated, not a truthy re.Match object
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_digit_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)


def filter_telephone_1(instance: Instance) -> bool:
    """Keep TELEPHONE instances whose written form contains at least one digit."""
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_telephone_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)


def filter_electronic_1(instance: Instance) -> bool:
    """Keep ELECTRONIC instances whose written form contains at least one digit."""
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_electronic_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)


def filter_fraction_1(instance: Instance) -> bool:
    """Keep FRACTION instances whose written form contains at least one digit."""
    return bool(re.search(r"[0-9]", instance.un_normalized))


def process_fraction_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)


def filter_address_1(instance: Instance) -> bool:
    """Accept every ADDRESS instance."""
    return True


def process_address_1(instance: Instance) -> Instance:
    """Keep only letters/spaces in the spoken form; the written form is unchanged."""
    normalized = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(token_type=instance.token_type, un_normalized=instance.un_normalized, normalized=normalized)
# Registry of (class_type, process_func, filter_func) triples consumed by filter_loaded_data().
_FILTER_SPECS = [
    ("CARDINAL", process_cardinal_1, filter_cardinal_1),
    ("ORDINAL", process_ordinal_1, filter_ordinal_1),
    ("DECIMAL", process_decimal_1, filter_decimal_1),
    ("MEASURE", process_measure_1, filter_measure_1),
    ("MONEY", process_money_1, filter_money_1),
    ("TIME", process_time_1, filter_time_1),
    ("DATE", process_date_1, filter_date_1),
    ("PLAIN", process_plain_1, filter_plain_1),
    ("PUNCT", process_punct_1, filter_punct_1),
    ("LETTERS", process_letters_1, filter_letters_1),
    ("VERBATIM", process_verbatim_1, filter_verbatim_1),
    ("DIGIT", process_digit_1, filter_digit_1),
    ("TELEPHONE", process_telephone_1, filter_telephone_1),
    ("ELECTRONIC", process_electronic_1, filter_electronic_1),
    ("FRACTION", process_fraction_1, filter_fraction_1),
    ("ADDRESS", process_address_1, filter_address_1),
    # EOS markers are passed through untouched
    (EOS_TYPE, lambda x: x, lambda x: True),
]
filters = [Filter(class_type=c, process_func=p, filter_func=f) for c, p, f in _FILTER_SPECS]
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
    """
    Filters list of instances

    Args:
        data: list of instances
        verbose: if True, print every instance that was accepted and transformed

    Returns: filtered and transformed list of instances (instances whose class
        filter rejects them are dropped)
    """
    kept = []
    for instance in data:
        matched = False
        for fil in filters:
            if fil.class_type == instance.token_type and fil.filter(instance):
                instance = fil.process(instance)
                matched = True
        if matched:
            if verbose:
                print(instance)
            kept.append(instance)
    return kept
def parse_args():
    """Builds and evaluates the command line argument parser (--input, --verbose)."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input", type=str, default='./en_with_types/output-00001-of-00100', help="input file path"
    )
    arg_parser.add_argument("--verbose", action='store_true', help="print filtered instances")
    return arg_parser.parse_args()
# Script entry point: load a Kaggle text-normalization data file, clean it with
# the per-class filters above, then group the result into sentences.
if __name__ == "__main__":
    args = parse_args()
    file_path = args.input
    print("Loading training data: " + file_path)
    instance_list = load_files([file_path])  # List of instances
    filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
    # NOTE(review): the sentence-grouping result is discarded here — presumably a
    # smoke test of the pipeline; confirm whether the output should be saved.
    training_data_to_sentences(filtered_instance_list)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,14 @@
st Street
street Street
expy Expressway
fwy Freeway
hwy Highway
dr Drive
ct Court
ave Avenue
av Avenue
cir Circle
blvd Boulevard
alley Alley
way Way
jct Junction
1 st Street
2 street Street
3 expy Expressway
4 fwy Freeway
5 hwy Highway
6 dr Drive
7 ct Court
8 ave Avenue
9 av Avenue
10 cir Circle
11 blvd Boulevard
12 alley Alley
13 way Way
14 jct Junction

View File

@@ -0,0 +1,52 @@
Alabama AL
Alaska AK
Arizona AZ
Arkansas AR
California CA
Colorado CO
Connecticut CT
Delaware DE
Florida FL
Georgia GA
Hawaii HI
Idaho ID
Illinois IL
Indiana IN
Indiana IND
Iowa IA
Kansas KS
Kentucky KY
Louisiana LA
Maine ME
Maryland MD
Massachusetts MA
Michigan MI
Minnesota MN
Mississippi MS
Missouri MO
Montana MT
Nebraska NE
Nevada NV
New Hampshire NH
New Jersey NJ
New Mexico NM
New York NY
North Carolina NC
North Dakota ND
Ohio OH
Oklahoma OK
Oregon OR
Pennsylvania PA
Rhode Island RI
South Carolina SC
South Dakota SD
Tennessee TN
Tennessee TENN
Texas TX
Utah UT
Vermont VT
Virginia VA
Washington WA
West Virginia WV
Wisconsin WI
Wyoming WY
1 Alabama AL
2 Alaska AK
3 Arizona AZ
4 Arkansas AR
5 California CA
6 Colorado CO
7 Connecticut CT
8 Delaware DE
9 Florida FL
10 Georgia GA
11 Hawaii HI
12 Idaho ID
13 Illinois IL
14 Indiana IN
15 Indiana IND
16 Iowa IA
17 Kansas KS
18 Kentucky KY
19 Louisiana LA
20 Maine ME
21 Maryland MD
22 Massachusetts MA
23 Michigan MI
24 Minnesota MN
25 Mississippi MS
26 Missouri MO
27 Montana MT
28 Nebraska NE
29 Nevada NV
30 New Hampshire NH
31 New Jersey NJ
32 New Mexico NM
33 New York NY
34 North Carolina NC
35 North Dakota ND
36 Ohio OH
37 Oklahoma OK
38 Oregon OR
39 Pennsylvania PA
40 Rhode Island RI
41 South Carolina SC
42 South Dakota SD
43 Tennessee TN
44 Tennessee TENN
45 Texas TX
46 Utah UT
47 Vermont VT
48 Virginia VA
49 Washington WA
50 West Virginia WV
51 Wisconsin WI
52 Wyoming WY

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,31 @@
one
two
three
four
five
six
seven
eight
nine
ten
eleven
twelve
thirteen
fourteen
fifteen
sixteen
seventeen
eighteen
nineteen
twenty
twenty one
twenty two
twenty three
twenty four
twenty five
twenty six
twenty seven
twenty eight
twenty nine
thirty
thirty one
1 one
2 two
3 three
4 four
5 five
6 six
7 seven
8 eight
9 nine
10 ten
11 eleven
12 twelve
13 thirteen
14 fourteen
15 fifteen
16 sixteen
17 seventeen
18 eighteen
19 nineteen
20 twenty
21 twenty one
22 twenty two
23 twenty three
24 twenty four
25 twenty five
26 twenty six
27 twenty seven
28 twenty eight
29 twenty nine
30 thirty
31 thirty one

View File

@@ -0,0 +1,12 @@
jan january
feb february
mar march
apr april
jun june
jul july
aug august
sep september
sept september
oct october
nov november
dec december
1 jan january
2 feb february
3 mar march
4 apr april
5 jun june
6 jul july
7 aug august
8 sep september
9 sept september
10 oct october
11 nov november
12 dec december

View File

@@ -0,0 +1,12 @@
january
february
march
april
may
june
july
august
september
october
november
december
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december

View File

@@ -0,0 +1,24 @@
1 january
2 february
3 march
4 april
5 may
6 june
7 july
8 august
9 september
10 october
11 november
12 december
01 january
02 february
03 march
04 april
05 may
06 june
07 july
08 august
09 september
10 october
11 november
12 december
1 1 january
2 2 february
3 3 march
4 4 april
5 5 may
6 6 june
7 7 july
8 8 august
9 9 september
10 10 october
11 11 november
12 12 december
13 01 january
14 02 february
15 03 march
16 04 april
17 05 may
18 06 june
19 07 july
20 08 august
21 09 september
22 10 october
23 11 november
24 12 december

View File

@@ -0,0 +1,16 @@
A. D AD
A.D AD
a. d AD
a.d AD
a. d. AD
a.d. AD
B. C BC
B.C BC
b. c BC
b.c BC
A. D. AD
A.D. AD
B. C. BC
B.C. BC
b. c. BC
b.c. BC
1 A. D AD
2 A.D AD
3 a. d AD
4 a.d AD
5 a. d. AD
6 a.d. AD
7 B. C BC
8 B.C BC
9 b. c BC
10 b.c BC
11 A. D. AD
12 A.D. AD
13 B. C. BC
14 B.C. BC
15 b. c. BC
16 b.c. BC

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,12 @@
.com dot com
.org dot org
.gov dot gov
.uk dot UK
.fr dot FR
.net dot net
.br dot BR
.in dot IN
.ru dot RU
.de dot DE
.it dot IT
.jpg dot jpeg
1 .com dot com
2 .org dot org
3 .gov dot gov
4 .uk dot UK
5 .fr dot FR
6 .net dot net
7 .br dot BR
8 .in dot IN
9 .ru dot RU
10 .de dot DE
11 .it dot IT
12 .jpg dot jpeg

View File

@@ -0,0 +1,21 @@
. dot
- dash
_ underscore
! exclamation mark
# number sign
$ dollar sign
% percent sign
& ampersand
' quote
* asterisk
+ plus
/ slash
= equal sign
? question mark
^ circumflex
` right single quote
{ left brace
| vertical bar
} right brace
~ tilde
, comma
1 . dot
2 - dash
3 _ underscore
4 ! exclamation mark
5 # number sign
6 $ dollar sign
7 % percent sign
8 & ampersand
9 ' quote
10 * asterisk
11 + plus
12 / slash
13 = equal sign
14 ? question mark
15 ^ circumflex
16 ` right single quote
17 { left brace
18 | vertical bar
19 } right brace
20 ~ tilde
21 , comma

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,8 @@
+ plus
- minus
/ divided
÷ divided
: divided
× times
* times
· times
1 + plus
2 - minus
3 / divided
4 ÷ divided
5 : divided
6 × times
7 * times
8 · times

View File

@@ -0,0 +1,127 @@
amu atomic mass unit
bar bar
° degree
º degree
°c degree Celsius
°C degree Celsius
ºc degree Celsius
ºC degree Celsius
℃ degree Celsius
cm2 square centimeter
cm² square centimeter
cm3 cubic centimeter
cm³ cubic centimeter
cm centimeter
cwt hundredweight
db decibel
dm3 cubic decimeter
dm³ cubic decimeter
dm decimeter
ds decisecond
°f degree Fahrenheit
°F degree Fahrenheit
℉ degree Fahrenheit
ft foot
ghz gigahertz
gw gigawatt
gwh gigawatt hour
hz hertz
" inch
kbps kilobit per second
kcal kilo calory
kgf kilogram force
kg kilogram
khz kilohertz
km2 square kilometer
km² square kilometer
km3 cubic kilometer
km³ cubic kilometer
km kilometer
kpa kilopascal
kwh kilowatt hour
kw kilowatt
kW kilowatt
lb pound
lbs pound
m2 square meter
m² square meter
m3 cubic meter
m³ cubic meter
mbps megabit per second
mg milligram
mhz megahertz
mi2 square mile
mi² square mile
mi3 cubic mile
mi³ cubic mile
cu mi cubic mile
mi mile
min minute
ml milliliter
mm2 square millimeter
mm² square millimeter
mol mole
mpa megapascal
mph mile per hour
ng nanogram
nm nanometer
ns nanosecond
oz ounce
pa pascal
% percent
rad radian
rpm revolution per minute
sq ft square foot
sq mi square mile
sv sievert
tb terabyte
tj terajoule
tl teraliter
v volt
yd yard
μg microgram
μm micrometer
μs microsecond
ω ohm
atm ATM
au AU
bq BQ
cc CC
cd CD
da DA
eb EB
ev EV
f F
gb GB
g G
gl GL
gpa GPA
gy GY
ha HA
h H
hl HL
hp GP
hs HS
kb KB
kl KL
kn KN
kt KT
kv KV
lm LM
ma MA
mA MA
mb MB
mc MC
mf MF
m M
mm MM
ms MS
mv MV
mw MW
pb PB
pg PG
ps PS
s S
tb TB
yb YB
zb ZB
Can't render this file because it contains an unexpected character in line 127 and column 6.

View File

@@ -0,0 +1,43 @@
atm atmosphere
bq becquerel
cd candela
da dalton
eb exabyte
f degree Fahrenheit
gb gigabyte
g gram
gl gigaliter
ha hectare
h hour
hl hectoliter
hp horsepower
hp horsepower
kb kilobit
kb kilobyte
ma megaampere
mA megaampere
ma milliampere
mA milliampere
mb megabyte
mc megacoulomb
mf megafarad
m meter
m minute
mm millimeter
mm millimeter
mm millimeter
ms megasecond
ms mega siemens
ms millisecond
mv millivolt
mV millivolt
mw megawatt
mW megawatt
pb petabyte
pg petagram
ps petasecond
s second
tb terabyte
tb terabyte
yb yottabyte
zb zettabyte
1 atm atmosphere
2 bq becquerel
3 cd candela
4 da dalton
5 eb exabyte
6 f degree Fahrenheit
7 gb gigabyte
8 g gram
9 gl gigaliter
10 ha hectare
11 h hour
12 hl hectoliter
13 hp horsepower
14 hp horsepower
15 kb kilobit
16 kb kilobyte
17 ma megaampere
18 mA megaampere
19 ma milliampere
20 mA milliampere
21 mb megabyte
22 mc megacoulomb
23 mf megafarad
24 m meter
25 m minute
26 mm millimeter
27 mm millimeter
28 mm millimeter
29 ms megasecond
30 ms mega siemens
31 ms millisecond
32 mv millivolt
33 mV millivolt
34 mw megawatt
35 mW megawatt
36 pb petabyte
37 pg petagram
38 ps petasecond
39 s second
40 tb terabyte
41 tb terabyte
42 yb yottabyte
43 zb zettabyte

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,39 @@
$ dollar
$ us dollar
US$ us dollar
฿ Thai Baht
£ pound
€ euro
₩ won
nzd new zealand dollar
rs rupee
chf swiss franc
dkk danish kroner
fim finnish markka
aed arab emirates dirham
¥ yen
czk czech koruna
mro mauritanian ouguiya
pkr pakistani rupee
crc costa rican colon
hk$ hong kong dollar
npr nepalese rupee
awg aruban florin
nok norwegian kroner
tzs tanzanian shilling
sek swedish kronor
cyp cypriot pound
r real
sar saudi riyal
cve cape verde escudo
rsd serbian dinar
dm german mark
shp saint helena pounds
php philippine peso
cad canadian dollar
ssp south sudanese pound
scr seychelles rupee
mvr maldivian rufiyaa
DH dirham
Dh dirham
Dhs. dirham
1 $ dollar
2 $ us dollar
3 US$ us dollar
4 ฿ Thai Baht
5 £ pound
6 euro
7 won
8 nzd new zealand dollar
9 rs rupee
10 chf swiss franc
11 dkk danish kroner
12 fim finnish markka
13 aed arab emirates dirham
14 ¥ yen
15 czk czech koruna
16 mro mauritanian ouguiya
17 pkr pakistani rupee
18 crc costa rican colon
19 hk$ hong kong dollar
20 npr nepalese rupee
21 awg aruban florin
22 nok norwegian kroner
23 tzs tanzanian shilling
24 sek swedish kronor
25 cyp cypriot pound
26 r real
27 sar saudi riyal
28 cve cape verde escudo
29 rsd serbian dinar
30 dm german mark
31 shp saint helena pounds
32 php philippine peso
33 cad canadian dollar
34 ssp south sudanese pound
35 scr seychelles rupee
36 mvr maldivian rufiyaa
37 DH dirham
38 Dh dirham
39 Dhs. dirham

View File

@@ -0,0 +1,4 @@
$ cents
US$ cents
€ cents
£ pence
1 $ cents
2 US$ cents
3 cents
4 £ pence

View File

@@ -0,0 +1,3 @@
$ cent
€ cent
£ penny
1 $ cent
2 cent
3 £ penny

View File

@@ -0,0 +1,2 @@
/ea each
/dozen
Can't render this file because it has a wrong number of fields in line 2.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,9 @@
one 1
two 2
three 3
four 4
five 5
six 6
seven 7
eight 8
nine 9
1 one 1
2 two 2
3 three 3
4 four 4
5 five 5
6 six 6
7 seven 7
8 eight 8
9 nine 9

View File

@@ -0,0 +1,18 @@
¼ 1/4
½ 1/2
¾ 3/4
⅐ 1/7
⅑ 1/9
⅒ 1/10
⅓ 1/3
⅔ 2/3
⅕ 1/5
⅖ 2/5
⅗ 3/5
⅘ 4/5
⅙ 1/6
⅚ 5/6
⅛ 1/8
⅜ 3/8
⅝ 5/8
⅞ 7/8
1 ¼ 1/4
2 ½ 1/2
3 ¾ 3/4
4 1/7
5 1/9
6 1/10
7 1/3
8 2/3
9 1/5
10 2/5
11 3/5
12 4/5
13 1/6
14 5/6
15 1/8
16 3/8
17 5/8
18 7/8

View File

@@ -0,0 +1 @@
hundred
1 hundred

View File

@@ -0,0 +1,10 @@
M million
MLN million
m million
mln million
B billion
b billion
BN billion
bn billion
K thousand
k thousand
1 M million
2 MLN million
3 m million
4 mln million
5 B billion
6 b billion
7 BN billion
8 bn billion
9 K thousand
10 k thousand

View File

@@ -0,0 +1,10 @@
ten 10
eleven 11
twelve 12
thirteen 13
fourteen 14
fifteen 15
sixteen 16
seventeen 17
eighteen 18
nineteen 19
1 ten 10
2 eleven 11
3 twelve 12
4 thirteen 13
5 fourteen 14
6 fifteen 15
7 sixteen 16
8 seventeen 17
9 eighteen 18
10 nineteen 19

View File

@@ -0,0 +1,22 @@
thousand
million
billion
trillion
quadrillion
quintillion
sextillion
septillion
octillion
nonillion
decillion
undecillion
duodecillion
tredecillion
quattuordecillion
quindecillion
sexdecillion
septendecillion
octodecillion
novemdecillion
vigintillion
centillion
1 thousand
2 million
3 billion
4 trillion
5 quadrillion
6 quintillion
7 sextillion
8 septillion
9 octillion
10 nonillion
11 decillion
12 undecillion
13 duodecillion
14 tredecillion
15 quattuordecillion
16 quindecillion
17 sexdecillion
18 septendecillion
19 octodecillion
20 novemdecillion
21 vigintillion
22 centillion

View File

@@ -0,0 +1,8 @@
twenty 2
thirty 3
forty 4
fifty 5
sixty 6
seventy 7
eighty 8
ninety 9
1 twenty 2
2 thirty 3
3 forty 4
4 fifty 5
5 sixty 6
6 seventy 7
7 eighty 8
8 ninety 9

View File

@@ -0,0 +1 @@
zero 0
1 zero 0

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,9 @@
first one
second two
third three
fourth four
fifth five
sixth six
seventh seven
eighth eight
ninth nine
1 first one
2 second two
3 third three
4 fourth four
5 fifth five
6 sixth six
7 seventh seven
8 eighth eight
9 ninth nine

View File

@@ -0,0 +1 @@
twelfth twelve
1 twelfth twelve

View File

@@ -0,0 +1,20 @@
`female.tsv` - List of common female names. Copyright (c) January 1991 by Mark Kantrowitz, 4987 names, Version 1.3 (29-MAR-94)
Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt)
`male.tsv` - List of common male names. Copyright (c) January 1991 by Mark Kantrowitz, 2940 names, Version 1.3 (29-MAR-94)
Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt)
[Corpora Readme.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/readme.txt):
You may use the lists of names for any purpose, so long as credit is given
in any published work. You may also redistribute the list if you
provide the recipients with a copy of this README file. The lists are
not in the public domain (I retain the copyright on the lists) but are
freely redistributable.
If you have any additions to the lists of names, I would appreciate
receiving them.
My email address is mkant+@cs.cmu.edu.
Mark Kantrowitz

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
chapter
class
part
article
section
paragraph
1 chapter
2 class
3 part
4 article
5 section
6 paragraph

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,83 @@
deer
fish
sheep
foot feet
goose geese
man men
mouse mice
tooth teeth
woman women
won
child children
ox oxen
wife wives
wolf wolves
analysis analyses
criterion criteria
lbs
focus foci
percent
hertz
kroner krone
inch inches
calory calories
yen
megahertz
gigahertz
kilohertz
hertz
CC
c c
horsepower
hundredweight
kilogram force kilograms force
mega siemens
revolution per minute revolutions per minute
mile per hour miles per hour
megabit per second megabits per second
square foot square feet
kilobit per second kilobits per second
degree Celsius degrees Celsius
degree Fahrenheit degrees Fahrenheit
ATM
AU
BQ
CC
CD
DA
EB
EV
F
GB
G
GL
GPA
GY
HA
H
HL
GP
HS
KB
KL
KN
KT
KV
LM
MA
MA
MB
MC
MF
M
MM
MS
MV
MW
PB
PG
PS
S
TB
YB
ZB
Can't render this file because it has a wrong number of fields in line 4.

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,2 @@
IP address is
IP is
1 IP address is
2 IP is

View File

@@ -0,0 +1,4 @@
ssn is SSN is
ssn is SSN is
SSN is
SSN
Can't render this file because it has a wrong number of fields in line 3.

View File

@@ -0,0 +1,5 @@
call me at
reach at
reached at
my number is
hit me up at
1 call me at
2 reach at
3 reached at
4 my number is
5 hit me up at

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,12 @@
p.m. PM
p.m PM
pm PM
P.M. PM
P.M PM
PM PM
a.m. AM
a.m AM
am AM
A.M. AM
A.M AM
AM AM
1 p.m. PM
2 p.m PM
3 pm PM
4 P.M. PM
5 P.M PM
6 PM PM
7 a.m. AM
8 a.m AM
9 am AM
10 A.M. AM
11 A.M AM
12 AM AM

View File

@@ -0,0 +1,14 @@
cst CST
c.s.t CST
cet CET
c.e.t CET
pst PST
p.s.t PST
est EST
e.s.t EST
pt PT
p.t PT
et ET
e.t ET
gmt GMT
g.m.t GMT
1 cst CST
2 c.s.t CST
3 cet CET
4 c.e.t CET
5 pst PST
6 p.s.t PST
7 est EST
8 e.s.t EST
9 pt PT
10 p.t PT
11 et ET
12 e.t ET
13 gmt GMT
14 g.m.t GMT

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,45 @@
Hon. Honorable
Mr. Mister
Mrs. Misses
Ms. Miss
Mr Mister
Mrs Misses
Ms Miss
AC air conditioning
AC air conditioner
AC air conditioners
AC alternating current
&Co. and Co.
&Co. and Company
Mon Monday
Tu Tuesday
Wed Wednesday
Th Thursday
Thur Thursday
Thurs Thursday
Fri Friday
Sat Saturday
Sun Sunday
Mon Mon
Tu Tu
Wed Wed
Th Th
Thur Thur
Thurs Thurs
Fri Fri
Sat Sat
Sun Sun
= equals
# number
No. number
No number
NO number
NO. number
NO nitrogen monoxide
NO NO
NO. NO.
No. No.
No No
VOL Volume
VOL. Volume
TV Television
1 Hon. Honorable
2 Mr. Mister
3 Mrs. Misses
4 Ms. Miss
5 Mr Mister
6 Mrs Misses
7 Ms Miss
8 AC air conditioning
9 AC air conditioner
10 AC air conditioners
11 AC alternating current
12 &Co. and Co.
13 &Co. and Company
14 Mon Monday
15 Tu Tuesday
16 Wed Wednesday
17 Th Thursday
18 Thur Thursday
19 Thurs Thursday
20 Fri Friday
21 Sat Saturday
22 Sun Sunday
23 Mon Mon
24 Tu Tu
25 Wed Wed
26 Th Th
27 Thur Thur
28 Thurs Thurs
29 Fri Fri
30 Sat Sat
31 Sun Sun
32 = equals
33 # number
34 No. number
35 No number
36 NO number
37 NO. number
38 NO nitrogen monoxide
39 NO NO
40 NO. NO.
41 No. No.
42 No No
43 VOL Volume
44 VOL. Volume
45 TV Television

View File

@@ -0,0 +1,14 @@
st street
st saint
dr doctor
dr drive
mt mount
sr senior
prof professor
mt mountain
sr senior
jr junior
vol volume
rd road
ave avenue
approx approximately
1 st street
2 st saint
3 dr doctor
4 dr drive
5 mt mount
6 sr senior
7 prof professor
8 mt mountain
9 sr senior
10 jr junior
11 vol volume
12 rd road
13 ave avenue
14 approx approximately

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,521 @@
a
aoj
aəj
aː
aːʲ
aː͡j
aː͡ɨ̯
ː
ː
a͡e
a͡i
a͡iː
a͡i̯
a͡j
a͡o
a͡u
a͡uː
a͡u̯
a͡w
a͡ə
a͡ɨ̯
ɪ
a͡ʊ
b
bː
c
cː
d
dː
d̪ʱ
d͡z
d͡zʷ
d͡zː
d͡ʑ
d͡ʒ
d͡ʒʱ
d͡ʒʲ
d͡ʒː
e
eː
eːʲ
eː͡j
ː
ẽ͡j̃
ː
e͡i
e͡iː
e͡ɨ̯
f
fː
h
hː
i
iəj
iəw
iː
iːʲ
ː
i͡u
i͡ə
i͡ɛ
j
jː
k
ː
ʼ
ʼ
kʼ
kː
k̚ʲ
k̟̚
k͡p̚
l
lː
m
ː
mː
n
nː
o
oː
oːʲ
õ͡j̃
õ͡w̃
ː
o͡u
o͡uː
p
ː
ʼ
pʼ
pː
p̚ʲ
p͜f
p͡f
q
qʼ
r
rː
ː
s
sʼ
sː
t
ː
ʼ
tʼ
tː
t̪ʰ
t͜s
t͡s
t͡sʰ
t͡sʰː
t͡sʲ
t͡sʷ
t͡sʼ
t͡sː
t͡ɕ
t͡ɕʰ
t͡ɕ͈
t͡ʂ
t͡ʂʼ
t͡ʃ
t͡ʃʰ
t͡ʃʰː
t͡ʃʲ
t͡ʃʷ
t͡ʃʼ
t͡ʃː
u
uəj
uː
uːʲ
ː
ũ͡j̃
u͡e
u͡i
u͡j
u͡ɔ
u͡ə
v
vː
w
x
xː
y
yː
yːʲ
z
zː
à
àː
á
áː
â
âː
ã
ã̠
æ
æː
æ̀
æ̀ː
æ̂
æ̂ː
æ͡ɪ
æ͡ʉ
ç
è
èː
é
éː
ê
êː
ì
ìː
í
íː
î
îː
ï
ð
ò
òː
ó
óː
ô
ôː
õ
õː
õ̞
ø
øː
øːʲ
ø̯
ù
ùː
ú
úː
û
ûː
ā
āː
ē
ēː
ĕ
ĕ͡ə
ě
ěː
ħ
ĩ
ĩː
ī
īː
ŋ
ŋʲ
ŋ̊
ŋ̍
ŋ̟
ŋ̩
ŋ͡m
ō
ŏ
ŏ͡ə
œ
œː
œ̃
œ͡i
œ͡iː
œ͡ʏ
ř
řː
ũ
ũː
ū
ūː
ŭ
ŭ͡ə
ǎ
ǎː
ǐ
ǐː
ǒ
ǒː
ǔ
ǔː
ǣ
ǣː
ɐ
ɐː
ɐ̃
ɐ̃͡j̃
ɐ̃͡w̃
ɐ̯
ɐ̯̯
ɑ
ɑː
ɑ̃
ɑ̃ː
ɒ
ɒʲ
ɒː
ɓ
ɔ
ɔː
ɔˤː
ɔ̀
ɔ̀ː
ɔ́
ɔ́ː
ɔ̃
ɔ̃ː
ɔ̰
ɔ͡i̯
ɔ͡ə
ɔ͡ɨ̯
ɔ͡ɪ
ɔ͡ʊ
ɕ
ɕʰ
ɕː
ɕ͈
ɖ
ɖʱ
ɗ
ɘ
ɘː
ə
əː
əˤ
ə̀
ə́
ə̃
ə̯
ə͡u̯
ə͡w
ə͡ɨ
ə͡ɨ̯
ɚ
ɛ
ɛʲ
ɛː
ɛˤː
ɛ̀
ɛ̀ː
ɛ́
ɛ́ː
ɛ̂
ɛ̂ː
ɛ̃
ɛ̃ː
ɛ̄
ɛ̄ː
ɛ̰
ɛ͡i
ɛ͡i̯
ɛ͡u
ɛ͡u̯
ɛ͡ɪ
ɛ͡ʊ
ɜ
ɜː
ɝ
ɝː
ɟ
ɟː
ɟ͡ʝ
ɡ
ɡʱ
ɡʲ
ɡʷ
ɡː
ɡ̊
ɣ
ɤ
ɥ
ɦ
ɨ
ɨəj
ɨː
ɨ̃ᵝ
ɨ̞
ɨ̥ᵝ
ɨ̯
ɨ͡u̯
ɨ͡w
ɨ͡ə
ɨᵝ
ɨᵝː
ɪ
ɪː
ɪ̀
ɪ́
ɪ̃
ɪ̯
ɪ̰
ɪ͡u̯
ɪ͡ʊ
ɫ
ɫː
ɬ
ɬʼ
ɭ
ɮ
ɯ
ɯː
ɯ̟̃ᵝ
ɯ̟̊ᵝ
ɯ̟ᵝ
ɯ̟ᵝː
ɰ
ɰ̃
ɰᵝ
ɱ
ɱ̩
ɲ
ɲː
ɲ̊
ɲ̟
ɳ
ɴ
ɸ
ɸʷ
ɹ
ɻ
ɽ
ɽʱ
ɾ
ɾʲ
ɾː
ɾ̝̊
ʀ
ʁ
ʁʷ
ʁː
ʂ
ʂʷ
ʃ
ʃʰ
ʃʲ
ʃʷ
ʃʷʼ
ʃʼ
ʃː
ʈ
ʈʰ
ʉ
ʉː
ʊ
ʊ̀
ʊ́
ʊ̃
ʊ̯
ʊ̯͡i
ʊ̯͡ɨ
ʊ̰
ʋ
ʌ
ʌ̹
ʍ
ʎ
ʏ
ʏː
ʏ̯
ʐ
ʐʷ
ʑ
ʒ
ʒʲ
ʒʷ
ʒː
ʔ
ʔʲ
ʔʷ
ʝ
˦ˀ˥
˦˥
˦˧˥
˦˩
˧ˀ˨
˧˦
˧˧
˧˨
˧˩
˨˩
˨˩˦
˨˩˨
β
θ
χ
χʷ
χː
ẽː
ẽ̞
1 a
2 aoj
3 aəj
4
5 aːʲ
6 aː͡j
7 aː͡ɨ̯
8
9 aˤː
10
11 a̠ː
12
13 a͡e
14 a͡i
15 a͡iː
16 a͡i̯
17 a͡j
18 a͡o
19 a͡u
20 a͡uː
21 a͡u̯
22 a͡w
23 a͡ə
24 a͡ɨ̯
25 a͡ɪ
26 a͡ʊ
27 b
28
29
30
31
32 c
33
34
35
36 d
37
38
39
40
41 d̪ʱ
42 d͡z
43 d͡zʷ
44 d͡zː
45 d͡ʑ
46 d͡ʒ
47 d͡ʒʱ
48 d͡ʒʲ
49 d͡ʒː
50 e
51
52 eːʲ
53 eː͡j
54 ẽː
55 ẽ͡j̃
56
57 e̞ː
58
59 e͡i
60 e͡iː
61 e͡ɨ̯
62 f
63
64
65 h
66
67 i
68 iəj
69 iəw
70
71
72 iːʲ
73 ĩː
74
75
76 i͡u
77 i͡ə
78 i͡ɛ
79 j
80
81
82 k
83
84 kʰː
85
86 kʲʼ
87
88 kʷʼ
89
90
91
92 k̚ʲ
93 k̟̚
94
95 k͡p̚
96 l
97
98
99
100
101 m
102
103 mʲː
104
105
106
107 n
108
109
110
111
112 o
113
114
115 oːʲ
116
117 õ͡j̃
118 õ͡w̃
119
120
121 o̞ː
122
123
124 o͡u
125 o͡uː
126 p
127
128 pʰː
129
130 pʷʼ
131
132
133
134 p̚ʲ
135
136 p͜f
137 p͡f
138 q
139
140
141 r
142
143
144
145 r̂ː
146
147
148 s
149
150
151
152
153
154 t
155
156 tʰː
157
158 tʷʼ
159
160
161
162
163 t̪ʰ
164
165 t͜s
166 t͡s
167 t͡sʰ
168 t͡sʰː
169 t͡sʲ
170 t͡sʷ
171 t͡sʼ
172 t͡sː
173 t͡ɕ
174 t͡ɕʰ
175 t͡ɕ͈
176 t͡ʂ
177 t͡ʂʼ
178 t͡ʃ
179 t͡ʃʰ
180 t͡ʃʰː
181 t͡ʃʲ
182 t͡ʃʷ
183 t͡ʃʼ
184 t͡ʃː
185 u
186 uəj
187
188
189 uːʲ
190 ũː
191 ũ͡j̃
192
193 u͡e
194 u͡i
195 u͡j
196 u͡ɔ
197 u͡ə
198 v
199
200
201 w
202
203 x
204
205
206 y
207
208 yːʲ
209
210 z
211
212
213
214 à
215 àː
216 á
217 áː
218 â
219 âː
220 ã
221 ã̠
222 æ
223 æː
224 æ̀
225 æ̀ː
226 æ̂
227 æ̂ː
228 æ͡ɪ
229 æ͡ʉ
230 ç
231 è
232 èː
233 é
234 éː
235 ê
236 êː
237 ì
238 ìː
239 í
240 íː
241 î
242 îː
243 ï
244 ð
245 ò
246 òː
247 ó
248 óː
249 ô
250 ôː
251 õ
252 õː
253 õ̞
254 ø
255 øː
256 øːʲ
257 ø̯
258 ù
259 ùː
260 ú
261 úː
262 û
263 ûː
264 ā
265 āː
266 ē
267 ēː
268 ĕ
269 ĕ͡ə
270 ě
271 ěː
272 ħ
273 ĩ
274 ĩː
275 ī
276 īː
277 ŋ
278 ŋʲ
279 ŋ̊
280 ŋ̍
281 ŋ̟
282 ŋ̩
283 ŋ͡m
284 ō
285 ŏ
286 ŏ͡ə
287 œ
288 œː
289 œ̃
290 œ͡i
291 œ͡iː
292 œ͡ʏ
293 ř
294 řː
295 ũ
296 ũː
297 ū
298 ūː
299 ŭ
300 ŭ͡ə
301 ǎ
302 ǎː
303 ǐ
304 ǐː
305 ǒ
306 ǒː
307 ǔ
308 ǔː
309 ǣ
310 ǣː
311 ɐ
312 ɐː
313 ɐ̃
314 ɐ̃͡j̃
315 ɐ̃͡w̃
316 ɐ̯
317 ɐ̯̯
318 ɑ
319 ɑː
320 ɑ̃
321 ɑ̃ː
322 ɒ
323 ɒʲ
324 ɒː
325 ɓ
326 ɔ
327 ɔː
328 ɔˤː
329 ɔ̀
330 ɔ̀ː
331 ɔ́
332 ɔ́ː
333 ɔ̃
334 ɔ̃ː
335 ɔ̰
336 ɔ͡i̯
337 ɔ͡ə
338 ɔ͡ɨ̯
339 ɔ͡ɪ
340 ɔ͡ʊ
341 ɕ
342 ɕʰ
343 ɕː
344 ɕ͈
345 ɖ
346 ɖʱ
347 ɗ
348 ɘ
349 ɘː
350 ə
351 əː
352 əˤ
353 ə̀
354 ə́
355 ə̃
356 ə̯
357 ə͡u̯
358 ə͡w
359 ə͡ɨ
360 ə͡ɨ̯
361 ɚ
362 ɛ
363 ɛʲ
364 ɛː
365 ɛˤː
366 ɛ̀
367 ɛ̀ː
368 ɛ́
369 ɛ́ː
370 ɛ̂
371 ɛ̂ː
372 ɛ̃
373 ɛ̃ː
374 ɛ̄
375 ɛ̄ː
376 ɛ̰
377 ɛ͡i
378 ɛ͡i̯
379 ɛ͡u
380 ɛ͡u̯
381 ɛ͡ɪ
382 ɛ͡ʊ
383 ɜ
384 ɜː
385 ɝ
386 ɝː
387 ɟ
388 ɟː
389 ɟ͡ʝ
390 ɡ
391 ɡʱ
392 ɡʲ
393 ɡʷ
394 ɡː
395 ɡ̊
396 ɣ
397 ɤ
398 ɥ
399 ɦ
400 ɨ
401 ɨəj
402 ɨː
403 ɨ̃ᵝ
404 ɨ̞
405 ɨ̥ᵝ
406 ɨ̯
407 ɨ͡u̯
408 ɨ͡w
409 ɨ͡ə
410 ɨᵝ
411 ɨᵝː
412 ɪ
413 ɪː
414 ɪ̀
415 ɪ́
416 ɪ̃
417 ɪ̯
418 ɪ̰
419 ɪ͡u̯
420 ɪ͡ʊ
421 ɫ
422 ɫː
423 ɬ
424 ɬʼ
425 ɭ
426 ɮ
427 ɯ
428 ɯː
429 ɯ̟̃ᵝ
430 ɯ̟̊ᵝ
431 ɯ̟ᵝ
432 ɯ̟ᵝː
433 ɰ
434 ɰ̃
435 ɰᵝ
436 ɱ
437 ɱ̩
438 ɲ
439 ɲː
440 ɲ̊
441 ɲ̟
442 ɳ
443 ɴ
444 ɸ
445 ɸʷ
446 ɹ
447 ɻ
448 ɽ
449 ɽʱ
450 ɾ
451 ɾʲ
452 ɾː
453 ɾ̝̊
454 ʀ
455 ʁ
456 ʁʷ
457 ʁː
458 ʂ
459 ʂʷ
460 ʃ
461 ʃʰ
462 ʃʲ
463 ʃʷ
464 ʃʷʼ
465 ʃʼ
466 ʃː
467 ʈ
468 ʈʰ
469 ʉ
470 ʉː
471 ʊ
472 ʊ̀
473 ʊ́
474 ʊ̃
475 ʊ̯
476 ʊ̯͡i
477 ʊ̯͡ɨ
478 ʊ̰
479 ʋ
480 ʌ
481 ʌ̹
482 ʍ
483 ʎ
484 ʏ
485 ʏː
486 ʏ̯
487 ʐ
488 ʐʷ
489 ʑ
490 ʒ
491 ʒʲ
492 ʒʷ
493 ʒː
494 ʔ
495 ʔʲ
496 ʔʷ
497 ʝ
498 ˦ˀ˥
499 ˦˥
500 ˦˧˥
501 ˦˩
502 ˧ˀ˨
503 ˧˦
504 ˧˧
505 ˧˨
506 ˧˩
507 ˨˩
508 ˨˩˦
509 ˨˩˨
510 β
511 θ
512 χ
513 χʷ
514 χː
515
516
517
518
519 ẽː
520 ẽ̞
521

View File

@@ -0,0 +1,21 @@
Mr. mister
Mrs. misses
Dr. doctor
Drs. doctors
Co. company
Lt. lieutenant
Sgt. sergeant
St. saint
Jr. junior
Maj. major
Hon. honorable
Gov. governor
Capt. captain
Esq. esquire
Gen. general
Ltd. limited
Rev. reverend
Col. colonel
Mt. mount
Ft. fort
etc. et cetera
1 Mr. mister
2 Mrs. misses
3 Dr. doctor
4 Drs. doctors
5 Co. company
6 Lt. lieutenant
7 Sgt. sergeant
8 St. saint
9 Jr. junior
10 Maj. major
11 Hon. honorable
12 Gov. governor
13 Capt. captain
14 Esq. esquire
15 Gen. general
16 Ltd. limited
17 Rev. reverend
18 Col. colonel
19 Mt. mount
20 Ft. fort
21 etc. et cetera

View File

@@ -0,0 +1,23 @@
& and
# hash
@ at
§ section
™ trademark
® registered trademark
© copyright
_ underscore
% percent sign
* asterisk
+ plus
/ slash
= equal sign
^ circumflex
| vertical bar
~ tilde
$ dollar
£ pound
€ euro
₩ won
¥ yen
° degree
º degree
1 & and
2 # hash
3 @ at
4 § section
5 trademark
6 ® registered trademark
7 © copyright
8 _ underscore
9 % percent sign
10 * asterisk
11 + plus
12 / slash
13 = equal sign
14 ^ circumflex
15 | vertical bar
16 ~ tilde
17 $ dollar
18 £ pound
19 euro
20 won
21 ¥ yen
22 ° degree
23 º degree

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import string
from pathlib import Path
from typing import Dict
import pynini
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8
# Basic character classes used as building blocks by all grammars.
# Acceptor over any single valid UTF-8 character.
NEMO_CHAR = utf8.VALID_UTF8_CHAR
# Acceptor over a single ASCII digit 0-9.
NEMO_DIGIT = byte.DIGIT
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
# Non-breaking space is used to protect spaces inside quoted token values
# (see convert_space below); GraphFst.delete_tokens converts it back.
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
# Sigma-star: accepts any string of UTF-8 characters (used as cdrewrite context).
NEMO_SIGMA = pynini.closure(NEMO_CHAR)
# Common whitespace-editing helpers shared by tagger/verbalizer grammars.
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Removes serialized ordering attributes ("preserve_order"/"field_order") from token strings.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
)
# English pluralization: irregular forms from TSV, then -y -> -ies, then -es, then default -s.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
)
# consonant + "y" -> "ies" (e.g. "city" -> "cities")
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
# sibilant endings take "es" (e.g. "bus" -> "buses")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
# default: append "s"
_s = NEMO_SIGMA + pynutil.insert("s")
# Priority union applies the most specific rule that matches.
graph_plural = plurals._priority_union(
    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
).optimize()
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
# Case conversion transducers over ASCII letters.
TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
TO_UPPER = pynini.invert(TO_LOWER)
# Small weights used to nudge path ranking without affecting acceptance.
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
    """
    Writes the given grammars to disk as an OpenFst finite state archive (FAR).

    Args:
        file_name: path of the FAR file to create
        graphs: mapping from rule name to the Pynini WFST graph stored under that name
    """
    far_exporter = export.Exporter(file_name)
    for rule_name in graphs:
        # Each graph is optimized before being written under its rule name.
        far_exporter[rule_name] = graphs[rule_name].optimize()
    far_exporter.close()
    print(f'Created {file_name}')
def get_plurals(fst):
    """
    Maps the words accepted by *fst* to their plural forms.

    Args:
        fst: Fst accepting singular forms

    Returns:
        Fst transducing to the corresponding plural forms
    """
    pluralized = SINGULAR_TO_PLURAL @ fst
    return pluralized
def get_singulars(fst):
    """
    Maps the words accepted by *fst* to their singular forms.

    Args:
        fst: Fst accepting plural forms

    Returns:
        Fst transducing to the corresponding singular forms
    """
    singularized = PLURAL_TO_SINGULAR @ fst
    return singularized
def convert_space(fst) -> 'pynini.FstLike':
    """
    Rewrites ordinary spaces in *fst*'s output to non-breaking spaces.

    Used only in tagger grammars for transducing token values within quotes,
    e.g. name: "hello kitty". This makes the transducer significantly slower,
    so apply it only when spaces may occur inside quotes.

    Args:
        fst: input fst

    Returns:
        output fst in which breaking spaces are replaced by non-breaking spaces
    """
    space_to_nbsp = pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)
    return fst @ pynini.cdrewrite(space_to_nbsp, "", "", NEMO_SIGMA)
class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        # Bug fix: this previously read `self.kind = str`, storing the builtin
        # `str` type instead of the `kind` argument.
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic
        # Precompiled grammars live under grammars/<kind>/<name>.far next to this module.
        self.far_path = Path(os.path.dirname(__file__)) / 'grammars' / kind / f'{name}.far'
        if self.far_exist():
            # Load the cached FAR instead of rebuilding the grammar.
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        # The underlying Pynini FST (None until built or loaded from a FAR).
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around given fst, e.g. ``cardinal { ... }``.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes class name wrap around output of given fst.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore regular spaces from the non-breaking spaces inserted by convert_space.
        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,50 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_UPPER, GraphFst, insert_space
from pynini.lib import pynutil
class AbbreviationFst(GraphFst):
    """
    Finite state transducer for classifying abbreviations written as runs of capital
    letters, with or without periods,
    e.g. "ABC" -> tokens { abbreviation { value: "A B C" } }

    Args:
        whitelist: whitelist FST
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
        super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)
        dot = pynini.accep(".")
        # A.B.C. -> A. B. C.
        graph = NEMO_UPPER + dot + pynini.closure(insert_space + NEMO_UPPER + dot, 1)
        # A.B.C. -> A.B.C.
        graph |= NEMO_UPPER + dot + pynini.closure(NEMO_UPPER + dot, 1)
        # ABC -> A B C
        graph |= NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)
        # exclude words that are included in the whitelist
        graph = pynini.compose(
            pynini.difference(pynini.project(graph, "input"), pynini.project(whitelist.graph, "input")), graph
        )
        # Wrap the expansion in the serialized token attribute: value: "..."
        graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
        graph = self.add_tokens(graph)
        self.fst = graph.optimize()

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
NEMO_NOT_QUOTE,
NEMO_SIGMA,
GraphFst,
insert_space,
)
from nemo_text_processing.text_normalization.en.taggers.date import get_four_digit_year_graph
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.examples import plurals
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals, e.g.
        -23 -> cardinal { negative: "true" integer: "twenty three" }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: if True, skip the extra weighting of the "hundred and ..." variants in
            add_optional_and (ranking is left to downstream language-model rescoring)
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        self.lm = lm
        self.deterministic = deterministic
        # TODO replace to have "oh" as a default for "0"
        # pre-compiled cardinal number-name grammar, e.g. "123" -> "one hundred twenty three"
        graph = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()
        # 2-3 digit strings, or a single non-zero digit, verbalized by the main grammar
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        ) @ graph

        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        # digit-by-digit reading, e.g. "12" -> "one two"
        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = single_digits_graph + pynini.closure(insert_space + single_digits_graph)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross("0", "oh")

            self.single_digits_graph = single_digits_graph_zero + pynini.closure(
                insert_space + single_digits_graph_zero
            )
            self.single_digits_graph |= single_digits_graph_oh + pynini.closure(insert_space + single_digits_graph_oh)

            # digit-by-digit reading of comma-grouped numbers; the commas are deleted
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + insert_space, 1, 3
            ) + pynini.closure(
                pynutil.delete(",")
                + single_digits_graph
                + insert_space
                + single_digits_graph
                + insert_space
                + single_digits_graph,
                1,
            )

        optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        # accept optional thousands separators: 1-3 leading digits, then comma-separated 3-digit groups
        graph = (
            pynini.closure(NEMO_DIGIT, 1, 3)
            + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3))
        ) @ graph

        self.graph = graph
        self.graph_with_and = self.add_optional_and(graph)

        if deterministic:
            # numbers with 5+ digits are read digit-by-digit (takes priority over the cardinal reading)
            long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
            final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
            # a leading zero forces digit-by-digit reading, e.g. "012"
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
            )
            final_graph |= cardinal_with_leading_zeros
        else:
            # read the zero prefix digit-by-digit, then the remainder as a regular cardinal
            leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros + pynutil.insert(" ") + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
            )
            # add small weight to non-default graphs to make sure the deterministic option is listed first
            final_graph = (
                self.graph_with_and
                | pynutil.add_weight(self.single_digits_graph, 0.0001)
                | get_four_digit_year_graph()  # allows e.g. 4567 be pronounced as forty five sixty seven
                | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
                | cardinal_with_leading_zeros
            )

        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

    def add_optional_and(self, graph):
        """
        Adds optional "and" variants after "hundred"/"thousand", e.g.
        "one hundred twenty" -> also "one hundred and twenty", and allows 3-digit
        numbers with no zero digit to drop "hundred" (e.g. "123" -> "one twenty three").
        The variants are slightly negatively weighted so they outrank nothing by default;
        when self.lm is True the input graph is returned unchanged.
        """
        graph_with_and = graph

        if not self.lm:
            graph_with_and = pynutil.add_weight(graph, 0.00001)
            not_quote = pynini.closure(NEMO_NOT_QUOTE)
            # output strings that contain no "thousand"/"million" after this point
            no_thousand_million = pynini.difference(
                not_quote, not_quote + pynini.union("thousand", "million") + not_quote
            ).optimize()
            # insert "and" after "hundred" only when no larger quantity word follows
            integer = (
                not_quote + pynutil.add_weight(pynini.cross("hundred ", "hundred and ") + no_thousand_million, -0.0001)
            ).optimize()

            no_hundred = pynini.difference(NEMO_SIGMA, not_quote + pynini.accep("hundred") + not_quote).optimize()
            # insert "and" after "thousand" only when "hundred" does not follow
            integer |= (
                not_quote + pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001)
            ).optimize()

            # 3-digit numbers with all non-zero digits may drop the word "hundred"
            optional_hundred = pynini.compose((NEMO_DIGIT - "0") ** 3, graph).optimize()
            optional_hundred = pynini.compose(optional_hundred, NEMO_SIGMA + pynini.cross(" hundred", "") + NEMO_SIGMA)
            graph_with_and |= pynini.compose(graph, integer).optimize()
            graph_with_and |= optional_hundred
        return graph_with_and

View File

@@ -0,0 +1,370 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_LOWER,
NEMO_SIGMA,
NEMO_NOT_QUOTE,
TO_LOWER,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.examples import plurals
from pynini.lib import pynutil
# Inverted digit/teen/tens maps (digits -> words), e.g. "15" -> "fifteen", "50" -> "fifty".
graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/number/teen.tsv"))).optimize()
graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/number/ty.tsv"))).optimize()

# Era suffixes for years (e.g. "A.D.", "B.C."), extended with punctuation-terminated variants.
year_suffix = load_labels(get_abs_path("data/date/year_suffix.tsv"))
year_suffix.extend(augment_labels_with_punct_at_end(year_suffix))
year_suffix = pynini.string_map(year_suffix).optimize()
def get_ties_graph(deterministic: bool = True):
    """
    Returns two digit transducer, e.g.
    03 -> o three
    12 -> twelve
    20 -> twenty
    """
    # teens, round tens ("20" -> "twenty"), and tens+digit ("21" -> "twenty one")
    two_digit = (
        graph_teen
        | ties_graph + pynutil.delete("0")
        | ties_graph + insert_space + graph_digit
    )
    # leading zero: "o" only in deterministic mode, "o"/"zero" otherwise
    if deterministic:
        zero_reading = pynini.cross("0", "o")
    else:
        zero_reading = pynini.cross("0", "o") | pynini.cross("0", "zero")
    two_digit |= zero_reading + insert_space + graph_digit
    return two_digit.optimize()
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred
    """
    graph_ties = get_ties_graph(deterministic)

    # decades with a plural "s" suffix, e.g. "1970s"; the trailing "0s" is consumed here
    graph_with_s = (
        (graph_ties + insert_space + graph_ties)
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")))
    ) + pynutil.delete("0s")

    graph_with_s |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred") + pynutil.delete("s")
    # pluralize the last word: trailing "y" -> "ies", otherwise append "s"
    graph_with_s = graph_with_s @ pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )

    # plain pair reading, e.g. "1219" -> "twelve nineteen", "3900" -> "thirty nine hundred"
    graph = graph_ties + insert_space + graph_ties
    graph |= (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred")

    # "thousand" readings for X00Y / X000s, e.g. "2001" -> "two thousand one"
    thousand_graph = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    )
    thousand_graph |= (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )

    graph |= graph_with_s
    if deterministic:
        # prefer the "thousand" reading where both readings apply
        graph = plurals._priority_union(thousand_graph, graph, NEMO_SIGMA)
    else:
        graph |= thousand_graph

    return graph.optimize()
def _get_two_digit_year_with_s_graph():
    """Plural decade, e.g. "'70s"/"70s" -> "seventies"."""
    optional_apostrophe = pynini.closure(pynutil.delete("'"), 0, 1)
    # "70s" -> "seventy" (drop "0s"), then rewrite the trailing "y" to "ies"
    pluralize = pynini.cdrewrite(pynini.cross("y", "ies"), "", "[EOS]", NEMO_SIGMA)
    decade = pynini.compose(ties_graph + pynutil.delete("0s"), pluralize)
    return (optional_apostrophe + decade).optimize()
def _get_year_graph(cardinal_graph, deterministic: bool = True):
    """
    Transducer for year, only from 1000 - 2999, e.g.
    1290 -> twelve ninety
    2000 - 2009 will be verbalized as two thousand.

    Also covers 3 digit years, e.g. 123 -> one twenty three,
    and years with an era suffix, e.g. 123 A.D., 4200 B.C.
    """
    graph = get_four_digit_year_graph(deterministic)
    # restrict to years starting with 1 or 2, with an optional plural "s" (space before "s" is removed)
    graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph

    graph |= _get_two_digit_year_with_s_graph()

    # 3-digit year read as digit + two-digit cardinal, e.g. "123" -> "one twenty three"
    three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
    # era suffix attaches to 4-digit (deterministic reading) or 3-digit years
    year_with_suffix = (
        (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix
    )
    graph |= year_with_suffix
    return graph.optimize()
def _get_two_digit_year(cardinal_graph, single_digits_graph):
    """Two-digit year: prefer the cardinal reading, fall back to digit-by-digit."""
    return (NEMO_DIGIT ** 2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date, e.g.
        jan. 5, 2012 -> date { month: "january" day: "five" year: "twenty twelve" preserve_order: true }
        jan. 5 -> date { month: "january" day: "five" preserve_order: true }
        5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" preserve_order: true }
        2012-01-05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012.01.05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012/01/05 -> date { year: "twenty twelve" month: "january" day: "five" }
        2012 -> date { year: "twenty twelve" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: if True, also generate field-reordered variants (e.g. MDY -> DMY) for
            language-model rescoring
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # january
        month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
        # January, JANUARY
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        # jan
        month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
        # jan, Jan, JAN -- all with an optional trailing period
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

        # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
        # year_graph |= three_digit_year

        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

        # ordinal day endings are deleted, e.g. "5th" -> "5"
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        # day 1-31, with optional leading "the " and optional ordinal ending
        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = _get_two_digit_year(
            cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
        )
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

        # if lm:
        #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
        #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
        #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

        # year preceded by a space or by ", " (e.g. "jan. 5, 2012")
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        graph_year |= (
            pynutil.insert(" year: \"")
            + pynini.accep(",")
            + pynini.closure(pynini.accep(" "), 0, 1)
            + year_graph
            + pynutil.insert("\"")
        )
        optional_graph_year = pynini.closure(graph_year, 0, 1)

        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        # month-day-year orders, e.g. "jan. 5", "jan. 5, 2012"
        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        # dash-separated, e.g. "jan-5-2012"
        graph_mdy |= (
            month_graph
            + pynini.cross("-", " ")
            + day_graph
            + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
        )

        # numeric MM-DD-YYYY with "-", "/" or "." separators; a leading zero on the day is dropped
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_mdy |= (
                month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        # day-month-year orders, e.g. "5 january 2012"
        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year

        # numeric DD-MM-YYYY: the day must not be a valid month number, to avoid MDY/DMY ambiguity
        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_dmy |= (
                day_ex_month
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        # NOTE(review): seeding the union with the empty-string acceptor means graph_ymd
        # also accepts "" -- presumably harmless after composition downstream; confirm.
        graph_ymd = pynini.accep("")
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_ymd |= (
                (year_graph | two_digit_year)
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )

        final_graph = graph_mdy | graph_dmy

        if not deterministic or lm:
            # preserve_order is optional here so reordered variants (added below) remain possible
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            # month-day without a year, e.g. "01-05"
            m_sep_d = (
                month_numbers_graph
                + pynutil.delete(pynini.union("-", "/"))
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
            final_graph |= m_sep_d
        else:
            final_graph += pynutil.insert(" preserve_order: true")

        final_graph |= graph_ymd | year_graph

        if not deterministic or lm:
            # build field-reordering transducers by enumerating every (month, day) label pair
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,129 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, TO_UPPER, GraphFst, get_abs_path
from pynini.lib import pynutil
# deletes exactly one space (used between a number and its quantity word)
delete_space = pynutil.delete(" ")

# quantity words (e.g. "thousand", "million" -- see data/number/thousand.tsv)
quantities = pynini.string_file(get_abs_path("data/number/thousand.tsv"))
# abbreviated quantities; TO_UPPER composition also admits upper-cased spellings
quantities_abbr = pynini.string_file(get_abs_path("data/number/quantity_abbr.tsv"))
quantities_abbr |= TO_UPPER @ quantities_abbr
def get_quantity(
    decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike', include_abbr: bool
) -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. 1 million -> integer_part: "one" quantity: "million"
    e.g. 1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
        include_abbr: if True, abbreviated quantity spellings (quantity_abbr.tsv) are accepted too
    """
    # exclude "thousand"/"k"/"K" for the bare-cardinal form
    # (presumably covered by the cardinal grammar itself -- TODO confirm)
    quantity_wo_thousand = pynini.project(quantities, "input") - pynini.union("k", "K", "thousand")
    if include_abbr:
        quantity_wo_thousand |= pynini.project(quantities_abbr, "input") - pynini.union("k", "K", "thousand")
    # cardinal + quantity, with an optional space between them
    res = (
        pynutil.insert("integer_part: \"")
        + cardinal_up_to_hundred
        + pynutil.insert("\"")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynutil.insert(" quantity: \"")
        + (quantity_wo_thousand @ (quantities | quantities_abbr))
        + pynutil.insert("\"")
    )

    if include_abbr:
        quantity = quantities | quantities_abbr
    else:
        quantity = quantities
    # decimal + any quantity (including "thousand"), e.g. "1.5 thousand"
    res |= (
        decimal
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynutil.insert("quantity: \"")
        + quantity
        + pynutil.insert("\"")
    )
    return res
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal, e.g.
        -12.5006 billion -> decimal { negative: "true" integer_part: "12" fractional_part: "five o o six" quantity: "billion" }
        1 billion -> decimal { integer_part: "one" quantity: "billion" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph_with_and
        cardinal_graph_hundred_component_at_least_one_none_zero_digit = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )

        # fractional digits are read digit-by-digit, e.g. "5006" -> "five o o six"
        self.graph = cardinal.single_digits_graph.optimize()

        if not deterministic:
            self.graph = self.graph | cardinal_graph

        point = pynutil.delete(".")
        optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
        self.graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        # integer part is optional so bare fractions like ".5" are accepted
        final_graph_wo_sign = (
            pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
            + point
            + pynutil.insert(" ")
            + self.graph_fractional
        )

        quantity_w_abbr = get_quantity(
            final_graph_wo_sign, cardinal_graph_hundred_component_at_least_one_none_zero_digit, include_abbr=True
        )
        quantity_wo_abbr = get_quantity(
            final_graph_wo_sign, cardinal_graph_hundred_component_at_least_one_none_zero_digit, include_abbr=False
        )
        self.final_graph_wo_negative_w_abbr = final_graph_wo_sign | quantity_w_abbr
        self.final_graph_wo_negative = final_graph_wo_sign | quantity_wo_abbr

        # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
        if not deterministic:
            # reject outputs mixing "oh" and "zero" in either order
            no_oh_zero = pynini.difference(
                NEMO_SIGMA,
                (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
                | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA),
            ).optimize()
            no_zero_oh = pynini.difference(
                NEMO_SIGMA, NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA
            ).optimize()

            # a "zero" integer part may alternatively be read as "oh"
            self.final_graph_wo_negative |= pynini.compose(
                self.final_graph_wo_negative,
                pynini.cdrewrite(
                    pynini.cross("integer_part: \"zero\"", "integer_part: \"oh\""), NEMO_SIGMA, NEMO_SIGMA, NEMO_SIGMA
                ),
            )
            self.final_graph_wo_negative = pynini.compose(self.final_graph_wo_negative, no_oh_zero).optimize()
            self.final_graph_wo_negative = pynini.compose(self.final_graph_wo_negative, no_zero_oh).optimize()

        final_graph = optional_graph_negative + self.final_graph_wo_negative

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
GraphFst,
get_abs_path,
insert_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
        e.g. cdf1@abc.edu -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        # punctuation characters allowed inside usernames/domains (input projection only)
        accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
        # common domain endings, presumably e.g. ".com" -- see data/electronic/domain.tsv
        accepted_common_domains = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
        )
        # token starting with a letter, followed by letters/digits/accepted symbols
        all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
        username = (
            pynutil.insert("username: \"") + all_accepted_symbols + pynutil.insert("\"") + pynini.cross('@', ' ')
        )

        domain_graph = all_accepted_symbols + pynini.accep('.') + all_accepted_symbols + NEMO_ALPHA

        # NOTE(review): ":" is a colon; verbalizing it as "semicolon" looks like a misnomer -- confirm intended output
        protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "semicolon")) + pynutil.insert(" "))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
            pynini.accep("://") @ protocol_symbols
        )
        protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)

        protocol_end = pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols
        protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)

        # a domain must not start with a protocol prefix (avoids double-matching full URLs)
        domain_graph = (
            pynutil.insert("domain: \"")
            + pynini.difference(domain_graph, pynini.project(protocol, "input") + NEMO_SIGMA)
            + pynutil.insert("\"")
        )
        domain_common_graph = (
            pynutil.insert("domain: \"")
            + pynini.difference(
                all_accepted_symbols
                + accepted_common_domains
                + pynini.closure(accepted_symbols + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols), 0, 1),
                pynini.project(protocol, "input") + NEMO_SIGMA,
            )
            + pynutil.insert("\"")
        )

        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
        # email
        graph = username + domain_graph
        # abc.com, abc.com/123-sm
        graph |= domain_common_graph
        # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
        graph |= protocol + pynutil.insert(" ") + domain_graph

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,55 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, get_abs_path
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction
    "23 4/5" ->
    tokens { fraction { integer_part: "twenty three" numerator: "four" denominator: "five" } }
    "23 4/5th" ->
    tokens { fraction { integer_part: "twenty three" numerator: "four" denominator: "five" } }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        # numerator followed by "/" with or without surrounding spaces
        numerator = (
            pynutil.insert("numerator: \"") + cardinal_graph + (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" "))
        )

        # drop ordinal endings from the denominator, e.g. "5th" -> "5"
        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        optional_end = pynini.closure(pynini.cross(pynini.union(*endings), ""), 0, 1)

        denominator = pynutil.insert("denominator: \"") + cardinal_graph + optional_end + pynutil.insert("\"")

        graph = pynini.closure(integer + pynini.accep(" "), 0, 1) + (numerator + denominator)
        # also accept fraction spellings listed in data/number/fraction.tsv
        # (presumably special glyphs rewritten into the "num/denom" form -- TODO confirm)
        graph |= pynini.closure(integer + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1) + pynini.compose(
            pynini.string_file(get_abs_path("data/number/fraction.tsv")), (numerator + denominator)
        )

        self.graph = graph
        final_graph = self.add_tokens(self.graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,304 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NON_BREAKING_SPACE,
NEMO_SIGMA,
NEMO_SPACE,
NEMO_UPPER,
SINGULAR_TO_PLURAL,
TO_LOWER,
GraphFst,
convert_space,
delete_space,
delete_zero_or_one_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst as OrdinalTagger
from nemo_text_processing.text_normalization.en.taggers.whitelist import get_formats
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as OrdinalVerbalizer
from pynini.examples import plurals
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure, suppletive aware, e.g.
-12kg -> measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" }
1kg -> measure { cardinal { integer: "one" } units: "kilogram" }
.5kg -> measure { decimal { fractional_part: "five" } units: "kilograms" }
Args:
cardinal: CardinalFst
decimal: DecimalFst
fraction: FractionFst
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        super().__init__(name="measure", kind="classify", deterministic=deterministic)
        # cardinals plus range forms like "2-3" / "2x3" / "2*2"
        cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and)

        graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
        if not deterministic:
            graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv"))

        # also accept units typed with upper-case letters by lowering before matching
        graph_unit |= pynini.compose(
            pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit
        ).optimize()

        graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL)
        graph_unit = convert_space(graph_unit)

        optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

        # compound per-units: "/" is verbalized as "per", e.g. "km/h"
        graph_unit2 = (
            pynini.cross("/", "per") + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit
        )

        optional_graph_unit2 = pynini.closure(
            delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1,
        )

        unit_plural = (
            pynutil.insert("units: \"")
            + (graph_unit_plural + optional_graph_unit2 | graph_unit2)
            + pynutil.insert("\"")
        )

        unit_singular = (
            pynutil.insert("units: \"") + (graph_unit + optional_graph_unit2 | graph_unit2) + pynutil.insert("\"")
        )

        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + unit_plural
        )

        # support radio FM/AM
        subgraph_decimal |= (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + delete_space
            + pynutil.insert(" } ")
            + pynutil.insert("units: \"")
            + pynini.union("AM", "FM")
            + pynutil.insert("\"")
        )

        # any cardinal other than exactly "1" takes the plural unit
        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert("integer: \"")
            + ((NEMO_SIGMA - "1") @ cardinal_graph)
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + unit_plural
        )

        # "1" takes the singular unit
        subgraph_cardinal |= (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert("integer: \"")
            + pynini.cross("1", "one")
            + delete_space
            + pynutil.insert("\"")
            + pynutil.insert(" } ")
            + unit_singular
        )

        # bare "per unit" (e.g. "/kg"); the integer slot holds the placeholder "-"
        unit_graph = (
            pynutil.insert("cardinal { integer: \"-\" } units: \"")
            + pynini.cross(pynini.union("/", "per"), "per")
            + delete_zero_or_one_space
            + pynutil.insert(NEMO_NON_BREAKING_SPACE)
            + graph_unit
            + pynutil.insert("\" preserve_order: true")
        )

        # decimal joined to an alphabetic unit by a dash; the dash is deleted
        decimal_dash_alpha = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynini.cross('-', '')
            + pynutil.insert(" } units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynutil.insert("\"")
        )

        # multiplier suffix, e.g. "4.5x"/"4.5X" -> units "x"
        decimal_times = (
            pynutil.insert("decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } units: \"")
            + pynini.cross(pynini.union('x', "X"), 'x')
            + pynutil.insert("\"")
        )

        # alphabetic prefix joined to a decimal by a dash; the dash stays with the unit
        alpha_dash_decimal = (
            pynutil.insert("units: \"")
            + pynini.closure(NEMO_ALPHA, 1)
            + pynini.accep('-')
            + pynutil.insert("\"")
            + pynutil.insert(" decimal { ")
            + decimal.final_graph_wo_negative
            + pynutil.insert(" } preserve_order: true")
        )

        subgraph_fraction = (
            pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit_plural
        )

        # street addresses are tagged as a measure with units "address"
        address = self.get_address_graph(cardinal)
        address = (
            pynutil.insert("units: \"address\" cardinal { integer: \"")
            + address
            + pynutil.insert("\" } preserve_order: true")
        )

        math_operations = pynini.string_file(get_abs_path("data/measure/math_operation.tsv"))
        delimiter = pynini.accep(" ") | pynutil.insert(" ")

        # equations with "=" on the right, e.g. "2 + 2 = 4" -> "two plus two equals four"
        math = (
            (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + math_operations
            + (delimiter | NEMO_ALPHA)
            + cardinal_graph
            + delimiter
            + pynini.cross("=", "equals")
            + delimiter
            + (cardinal_graph | NEMO_ALPHA)
        )

        # equations with "=" on the left, e.g. "4 = 2 + 2"
        math |= (
            (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + pynini.cross("=", "equals")
            + delimiter
            + (cardinal_graph | NEMO_ALPHA)
            + delimiter
            + math_operations
            + delimiter
            + cardinal_graph
        )

        math = (
            pynutil.insert("units: \"math\" cardinal { integer: \"")
            + math
            + pynutil.insert("\" } preserve_order: true")
        )

        final_graph = (
            subgraph_decimal
            | subgraph_cardinal
            | unit_graph
            | decimal_dash_alpha
            | decimal_times
            | alpha_dash_decimal
            | subgraph_fraction
            | address
            | math
        )

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
def get_range(self, cardinal: GraphFst):
"""
Returns range forms for measure tagger, e.g. 2-3, 2x3, 2*2
Args:
cardinal: cardinal GraphFst
"""
range_graph = cardinal + pynini.cross(pynini.union("-", " - "), " to ") + cardinal
for x in [" x ", "x"]:
range_graph |= cardinal + pynini.cross(x, " by ") + cardinal
if not self.deterministic:
range_graph |= cardinal + pynini.cross(x, " times ") + cardinal
for x in ["*", " * "]:
range_graph |= cardinal + pynini.cross(x, " times ") + cardinal
return range_graph.optimize()
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                preserve_order: true

        Args:
            cardinal: cardinal GraphFst providing the number-verbalization sub-graphs
        """
        # ordinal tagger composed with its verbalizer so street ordinals
        # (e.g. "5th") come out fully spelled
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        ordinal_num = pynini.compose(
            pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\""), ordinal_verbalizer
        )
        # house number: 3-4 digits read as two groups; the second group
        # keeps a leading zero as "zero"
        address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
        address_num += insert_space + NEMO_DIGIT ** 2 @ (
            pynini.closure(pynini.cross("0", "zero "), 0, 1)
            + cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        # to handle the rest of the numbers
        address_num = pynini.compose(NEMO_DIGIT ** (3, 4), address_num)
        address_num = plurals._priority_union(address_num, cardinal.graph, NEMO_SIGMA)
        # optional compass direction with optional trailing dot, e.g. " N." -> " North"
        direction = (
            pynini.cross("E", "East")
            | pynini.cross("S", "South")
            | pynini.cross("W", "West")
            | pynini.cross("N", "North")
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        direction = pynini.closure(pynini.accep(NEMO_SPACE) + direction, 0, 1)
        # street part: optional ordinal or capitalized word(s), ending in a
        # street-type word from address_word.tsv (e.g. "Expy" -> "Expressway")
        address_words = get_formats(get_abs_path("data/address/address_word.tsv"))
        address_words = (
            pynini.accep(NEMO_SPACE)
            + (pynini.closure(ordinal_num, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
            + NEMO_SPACE
            + pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
            + address_words
        )
        # optional ", <city>" span (letters and spaces only)
        city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1)
        # a dotted variant of each abbreviation (e.g. "C.A") is generated,
        # then the map is inverted so written abbreviations expand to names
        # (assumes state.tsv rows are [name, abbreviation] — TODO confirm)
        states = load_labels(get_abs_path("data/address/state.tsv"))
        additional_options = []
        for x, y in states:
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        state = pynini.invert(state_graph)
        state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1)
        # 5-digit ZIP code read digit by digit
        zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
        zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, 0, 1,)
        # full address, with or without the city/state/zip tail
        address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1)
        address |= address_num + direction + address_words + pynini.closure(pynini.cross(".", ""), 0, 1)
        return address

View File

@@ -0,0 +1,192 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
SINGULAR_TO_PLURAL,
GraphFst,
convert_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.lib import pynutil
# Pre-built maps from currency symbols/abbreviations to spoken unit names,
# loaded once at import time from the bundled TSVs:
#   minor units, e.g. cents ("cent" / "cents") ...
min_singular = pynini.string_file(get_abs_path("data/money/currency_minor_singular.tsv"))
min_plural = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv"))
# ... and major units in singular form, e.g. "$" -> "dollar".
# (fix: removed redundant double parentheses around the path argument)
maj_singular = pynini.string_file(get_abs_path("data/money/currency_major.tsv"))
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money, suppletive aware, e.g.
        $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
        $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
        $1 -> money { currency_maj: "dollar" integer_part: "one" }
        $1.00 -> money { currency_maj: "dollar" integer_part: "one" }
        $0.05 -> money { fractional_part: "five" currency_min: "cents" preserve_order: true }
        $1 million -> money { currency_maj: "dollars" integer_part: "one" quantity: "million" }
        $1.2 million -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two" quantity: "million" }
        $1.2320 -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two three two" }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
        super().__init__(name="money", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph_with_and
        graph_decimal_final = decimal.final_graph_wo_negative_w_abbr

        # list of (symbol, spoken name) pairs; iterated below to build
        # one per-currency sub-graph per symbol
        maj_singular_labels = load_labels(get_abs_path("data/money/currency_major.tsv"))
        maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
        maj_unit_singular = convert_space(maj_singular)

        graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        # drops ".00"-style all-zero fractions, e.g. "$1.00" -> "$1"
        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
        )

        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        decimal_delete_last_zeros = (
            pynini.closure(NEMO_DIGIT | pynutil.delete(","))
            + pynini.accep(".")
            + pynini.closure(NEMO_DIGIT, 2)
            + (NEMO_DIGIT - "0")
            + pynini.closure(pynutil.delete("0"))
        )
        decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

        graph_decimal = (
            graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
        )

        graph_integer = (
            pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
        )

        # integer amounts: "1" takes the singular unit, everything else plural
        graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
        graph_integer_only |= graph_maj_plural + insert_space + graph_integer

        final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
        ) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0"))
            | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
            | ((NEMO_DIGIT - "0") + NEMO_DIGIT)
        )

        graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")
        # format ** dollars ** cent
        # Built per currency symbol: the symbol is deleted from the input and
        # re-inserted to look up the matching major/minor unit names.
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            preserve_order = pynutil.insert(" preserve_order: true")
            integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
            integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular
            # accept thousands separators in the integer part, e.g. "1,000"
            integer_plus_maj_with_comma = pynini.compose(
                NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
            )
            integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            # minor-unit part: "1" takes the singular minor unit, rest plural
            graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
            graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
            graph_fractional = (
                two_digits_fractional_part
                @ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
                @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
            )
            graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

            fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
            fractional_plus_min |= (
                graph_fractional_one + insert_space + pynutil.insert(curr_symbol) @ graph_min_singular
            )

            # "$12.05" -> "twelve dollars five cents" style readings
            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min
            if not deterministic:
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj
                    + pynini.cross(".", " ")
                    + pynutil.insert("fractional_part: \"")
                    + two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
                    + pynutil.insert("\""),
                    weight=0.0001,
                )
            default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
            # fraction-only amounts, e.g. "$0.05" / "$.05" -> "five cents"
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") + fractional_plus_min
            )
            decimal_graph_with_minor_curr = (
                pynutil.delete(curr_symbol) + decimal_graph_with_minor_curr + preserve_order
            )
            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None
                else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr).optimize()
            )

            if not deterministic:
                # alternatives that reorder unit/value, only offered in
                # non-deterministic mode
                integer_graph_reordered_curr = (
                    pynutil.delete(curr_symbol) + integer_plus_maj + preserve_order
                ).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None
                    else pynini.union(integer_graph_reordered, integer_graph_reordered_curr).optimize()
                )
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol)
                    + default_fraction_graph
                    + insert_space
                    + pynutil.insert(curr_symbol) @ graph_maj_plural
                )

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None
                    else pynini.union(decimal_default_reordered, decimal_default_reordered_curr)
                ).optimize()

        # negative weight so the minor-unit ("... cents") reading is preferred
        # over the plain decimal reading in shortest-path selection
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

        if not deterministic:
            final_graph |= integer_graph_reordered | decimal_default_reordered
            # to handle "$2.00" cases
            final_graph |= pynini.compose(
                NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
            )
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,61 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal, e.g.
        13th -> ordinal { integer: "thirteen" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        # digit string, possibly containing "," thousands separators
        digits_with_commas = pynini.closure(NEMO_DIGIT | pynini.accep(","))

        def suffix_format(final_digit, suffix):
            # Number ending in `final_digit` whose preceding digit is not "1"
            # (so 11th/12th/13th are excluded), followed by the deleted
            # lower- or upper-case ordinal suffix.
            return (
                pynini.closure(digits_with_commas + (NEMO_DIGIT - "1"), 0, 1)
                + pynini.accep(final_digit)
                + pynutil.delete(pynini.union(suffix, suffix.upper()))
            )

        st_format = suffix_format("1", "st")
        nd_format = suffix_format("2", "nd")
        rd_format = suffix_format("3", "rd")
        # "th" covers everything else: digits not ending in 1/2/3, the teens
        # (…11/…12/…13), and endings where 1/2/3 follows a non-"1" digit
        th_format = pynini.closure(
            (NEMO_DIGIT - "1" - "2" - "3")
            | (digits_with_commas + "1" + NEMO_DIGIT)
            | (digits_with_commas + (NEMO_DIGIT - "1") + (NEMO_DIGIT - "1" - "2" - "3")),
            1,
        ) + pynutil.delete(pynini.union("th", "TH"))

        # strip the suffix, then verbalize the remaining digits as a cardinal
        self.graph = (st_format | nd_format | rd_format | th_format) @ cardinal_graph
        final_graph = self.add_tokens(
            pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        )
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,65 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from unicodedata import category
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.examples import plurals
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
        ascii_punct = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
        excluded = ["[", "]"]
        # every Unicode code point whose category is punctuation ("P*"),
        # except the square brackets excluded above
        unicode_punct = [
            chr(cp)
            for cp in range(sys.maxunicode)
            if category(chr(cp)).startswith("P") and chr(cp) not in excluded
        ]
        # whitelisted symbols are handled by other grammars, so drop them here
        whitelist = [entry[0] for entry in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
        self.punct_marks = [mark for mark in unicode_punct + list(ascii_punct) if mark not in whitelist]

        punct = pynini.closure(pynini.union(*self.punct_marks), 1)
        # "<...>" / "</...>" emphasis markup is accepted verbatim and takes
        # priority over the plain punctuation reading
        not_angle = NEMO_NOT_SPACE - pynini.union("<", ">")
        emphasis = (
            pynini.accep("<")
            + (
                (pynini.closure(not_angle, 1) + pynini.closure(pynini.accep("/"), 0, 1))
                | (pynini.accep("/") + pynini.closure(not_angle, 1))
            )
            + pynini.accep(">")
        )
        self.graph = plurals._priority_union(emphasis, punct, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()

View File

@@ -0,0 +1,102 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, convert_space
from pynini.lib import pynutil
class RangeFst(GraphFst):
    """
    This class is a composite class of two other class instances

    Args:
        time: composed tagger and verbalizer
        date: composed tagger and verbalizer
        cardinal: tagger
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(
        self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False,
    ):
        super().__init__(name="range", kind="classify", deterministic=deterministic)
        # optionally swallow one space around the range separator
        delete_space = pynini.closure(pynutil.delete(" "), 0, 1)
        approx = pynini.cross("~", "approximately")

        # TIME
        time_graph = time + delete_space + pynini.cross("-", " to ") + delete_space + time
        self.graph = time_graph | (approx + time)

        # local rebind: from here on `cardinal` is the FST, not the tagger object
        cardinal = cardinal.graph_with_and

        # YEAR
        # years, optionally with a decade "s" suffix, e.g. "1990s"
        date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
        date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
        year_to_year_graph = (
            date_year_four_digit
            + delete_space
            + pynini.cross("-", " to ")
            + delete_space
            + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal))
        )
        # "mid-90s" -> "mid 90s"
        mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit)

        self.graph |= year_to_year_graph
        self.graph |= mid_year_graph

        # ADDITION
        range_graph = cardinal + pynini.closure(pynini.cross("+", " plus ") + cardinal, 1)
        range_graph |= cardinal + pynini.closure(pynini.cross(" + ", " plus ") + cardinal, 1)
        range_graph |= approx + cardinal
        range_graph |= cardinal + (pynini.cross("...", " ... ") | pynini.accep(" ... ")) + cardinal

        if not deterministic or lm:
            # cardinal ----
            # "-" is ambiguous here ("to" vs "minus"), so only offered
            # in non-deterministic / LM mode
            cardinal_to_cardinal_graph = (
                cardinal + delete_space + pynini.cross("-", pynini.union(" to ", " minus ")) + delete_space + cardinal
            )

            range_graph |= cardinal_to_cardinal_graph | (
                cardinal + delete_space + pynini.cross(":", " to ") + delete_space + cardinal
            )

            # MULTIPLY
            for x in [" x ", "x"]:
                range_graph |= cardinal + pynini.closure(
                    pynini.cross(x, pynini.union(" by ", " times ")) + cardinal, 1
                )
            for x in ["*", " * "]:
                range_graph |= cardinal + pynini.closure(pynini.cross(x, " times ") + cardinal, 1)

            # supports "No. 12" -> "Number 12"
            range_graph |= (
                (pynini.cross(pynini.union("NO", "No"), "Number") | pynini.cross("no", "number"))
                + pynini.closure(pynini.union(". ", " "), 0, 1)
                + cardinal
            )

            for x in ["/", " / "]:
                range_graph |= cardinal + pynini.closure(pynini.cross(x, " divided by ") + cardinal, 1)

        self.graph |= range_graph
        self.graph = self.graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()

View File

@@ -0,0 +1,114 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.lib import pynutil
class RomanFst(GraphFst):
    """
    Finite state transducer for classifying roman numbers:
        e.g. "IV" -> tokens { roman { integer: "four" } }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="roman", kind="classify", deterministic=deterministic)
        # roman numeral -> spoken cardinal map, e.g. "IV" -> "four"
        roman_dict = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
        default_graph = pynini.string_map(roman_dict).optimize()
        default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")
        # only the first `ordinal_limit` entries of roman_to_spoken.tsv get
        # ordinal treatment (assumes the file is ordered by value — TODO confirm)
        ordinal_limit = 19

        if deterministic:
            # exclude "I"
            start_idx = 1
        else:
            start_idx = 0

        graph_teens = pynini.string_map([x[0] for x in roman_dict[start_idx:ordinal_limit]]).optimize()

        # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
        names = get_names()
        graph = (
            pynutil.insert("key_the_ordinal: \"")
            + names
            + pynutil.insert("\"")
            + pynini.accep(" ")
            + graph_teens @ default_graph
        ).optimize()

        # single symbol roman numerals with preceding key words (multiple formats) are converted to cardinal form
        # each key word is accepted as-is, Capitalized, and UPPER-CASED
        key_words = []
        for k_word in load_labels(get_abs_path("data/roman/key_word.tsv")):
            key_words.append(k_word)
            key_words.append([k_word[0][0].upper() + k_word[0][1:]])
            key_words.append([k_word[0].upper()])
        key_words = pynini.string_map(key_words).optimize()
        graph |= (
            pynutil.insert("key_cardinal: \"") + key_words + pynutil.insert("\"") + pynini.accep(" ") + default_graph
        ).optimize()

        if deterministic or lm:
            # two digit roman numerals up to 49
            roman_to_cardinal = pynini.compose(
                pynini.closure(NEMO_ALPHA, 2),
                (
                    pynutil.insert("default_cardinal: \"default\" ")
                    + (pynini.string_map([x[0] for x in roman_dict[:50]]).optimize()) @ default_graph
                ),
            )
            graph |= roman_to_cardinal
        elif not lm:
            # two or more digit roman numerals
            # "I" alone is excluded to avoid clashing with the pronoun
            roman_to_cardinal = pynini.compose(
                pynini.difference(NEMO_SIGMA, "I"),
                (
                    pynutil.insert("default_cardinal: \"default\" integer: \"")
                    + pynini.string_map(roman_dict).optimize()
                    + pynutil.insert("\"")
                ),
            ).optimize()
            graph |= roman_to_cardinal

        # convert three digit roman or up with suffix to ordinal
        roman_to_ordinal = pynini.compose(
            pynini.closure(NEMO_ALPHA, 3),
            (pynutil.insert("default_ordinal: \"default\" ") + graph_teens @ default_graph + pynutil.delete("th")),
        )
        graph |= roman_to_ordinal
        graph = self.add_tokens(graph.optimize())
        self.fst = graph.optimize()
def get_names():
    """
    Returns a graph matching common male and female first names,
    in both their original casing and fully upper-cased form.
    """
    names = None
    for tsv in ("data/roman/male.tsv", "data/roman/female.tsv"):
        labels = load_labels(get_abs_path(tsv))
        # also accept each name written in all caps
        labels.extend([[label[0].upper()] for label in labels])
        graph = pynini.string_map(labels).optimize()
        names = graph if names is None else names | graph
    return names

View File

@@ -0,0 +1,136 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
GraphFst,
convert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
from pynini.examples import plurals
from pynini.lib import pynutil
class SerialFst(GraphFst):
    """
    Finite state transducer for classifying serial numbers: mixed sequences of
    digits, letters, symbols and delimiters ("-", "/", " "), e.g.:
        c325b -> tokens { cardinal { integer: "c three two five b" } }

    Args:
        cardinal: cardinal tagger GraphFst
        ordinal: ordinal tagger GraphFst (its inputs are excluded from serials)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: whether to use for hybrid LM
    """

    def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
        # NOTE(review): registered under the "integer" name, not "serial" —
        # presumably so serials are verbalized like cardinals; confirm
        super().__init__(name="integer", kind="classify", deterministic=deterministic)

        """
        Finite state transducer for classifying serial (handles only cases without delimiters,
        values with delimiters are handled by default).
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }
        """
        # long numbers (6+ digits) are read digit by digit, shorter ones as cardinals
        num_graph = pynini.compose(NEMO_DIGIT ** (6, ...), cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT ** (1, 5), cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph
        ).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross(
            "#", "hash"
        )
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
            )

        # add space between letter and digit/symbol
        symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        # context-dependent rewrites inserting a space at every
        # letter<->digit/symbol boundary
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        if not deterministic:
            delimiter |= pynini.cross("-", " dash ") | pynini.cross("/", " slash ")

        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        next_alpha_or_num |= pynini.closure(
            delimiter
            + num_graph
            + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
            + alphas
        )

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (
            num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)
        )
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph
        ).optimize()

        # small positive weight disfavors the serial reading when another
        # semiotic class also matches
        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # "^2"/"^3" exponents, e.g. "m^2" -> "m squared"
        serial_graph |= (
            pynini.closure(NEMO_NOT_SPACE, 1)
            + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
        )

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter)
            + serial_graph
            + pynini.closure(delimiter + (serial_graph | num_graph | alphas))
        )
        serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
        # require at least two characters so single symbols are not tagged
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()

        # this is not to verbalize "/" as "slash" in cases like "import/export"
        serial_graph = pynini.compose(
            pynini.difference(
                NEMO_SIGMA, pynini.closure(NEMO_ALPHA, 1) + pynini.accep("/") + pynini.closure(NEMO_ALPHA, 1)
            ),
            serial_graph,
        )
        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()

View File

@@ -0,0 +1,133 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SIGMA,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
plurals,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension
    country code optional: +***
    number part: ***-***-****, or (***) ***-****
    extension optional: 1-9999
    E.g
    +1 123-123-5678-1 -> telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" }
    1-800-GO-U-HAUL -> telephone { country_code: "one" number_part: "one, eight hundred GO U HAUL" }

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        zero = pynini.cross("0", "zero")
        if not deterministic:
            # alternative spoken forms for "0"
            zero |= pynini.cross("0", pynini.union("o", "oh"))
        # single spoken digit; digit.tsv is inverted so the written digit
        # is on the input side
        digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() | zero

        # optional spoken prefix such as a telephone prompt, then optional
        # "+" and 1-3 country-code digits
        telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
        country_code = (
            pynini.closure(telephone_prompts + delete_extra_space, 0, 1)
            + pynini.closure(pynini.cross("+", "plus "), 0, 1)
            + pynini.closure(digit + insert_space, 0, 2)
            + digit
            + pynutil.insert(",")
        )
        country_code |= telephone_prompts
        country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
        country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space

        # 3-digit area code; "800" is special-cased as "eight hundred"
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_part = pynini.cross("800", "eight hundred") | pynini.compose(
            pynini.difference(NEMO_SIGMA, "800"), area_part_default
        )
        # accepted as "***-", "***.", or "(***) " / "(***)-"
        area_part = (
            (area_part + (pynutil.delete("-") | pynutil.delete(".")))
            | (
                pynutil.delete("(")
                + area_part
                + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
            )
        ) + add_separator

        # remaining 7 characters: digits or letters (vanity numbers),
        # with "-" or "." separators read as pauses
        del_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1)
        number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator)) ** 7
        number_words = pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross("-", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross("-", ' '))
        )
        number_words |= pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross(".", ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross(".", ' '))
        )
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")
        # optional 1-4 digit extension
        extension = (
            pynutil.insert("extension: \"") + pynini.closure(digit + insert_space, 0, 3) + digit + pynutil.insert("\"")
        )
        extension = pynini.closure(insert_space + extension, 0, 1)

        # longer matches take priority over shorter ones
        graph = plurals._priority_union(country_code + number_part, number_part, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(country_code + number_part + extension, graph, NEMO_SIGMA).optimize()
        graph = plurals._priority_union(number_part + extension, graph, NEMO_SIGMA).optimize()

        # ip
        # IPv4-style dotted quads; each group is read as 1-3 single digits
        ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
        digit_to_str_graph = digit + pynini.closure(pynutil.insert(" ") + digit, 0, 2)
        ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph) ** 3
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ip_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ip_graph.optimize()
            + pynutil.insert("\"")
        )
        # ssn
        # social security numbers: 3-2-4 digit groups separated by "-"
        ssn_prompts = pynini.string_file(get_abs_path("data/telephone/ssn_prompt.tsv"))
        three_digit_part = digit + (pynutil.insert(" ") + digit) ** 2
        two_digit_part = digit + pynutil.insert(" ") + digit
        four_digit_part = digit + (pynutil.insert(" ") + digit) ** 3
        ssn_separator = pynini.cross("-", ", ")
        ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part
        graph |= (
            pynini.closure(
                pynutil.insert("country_code: \"") + ssn_prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
            )
            + pynutil.insert("number_part: \"")
            + ssn_graph.optimize()
            + pynutil.insert("\"")
        )

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,132 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
GraphFst,
convert_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time, e.g.
        12:30 a.m. est -> time { hours: "twelve" minutes: "thirty" suffix: "a m" zone: "e s t" }
        2.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "a m" }
        02.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "a m" }
        2.00 a.m. -> time { hours: "two" suffix: "a m" }
        2 a.m. -> time { hours: "two" suffix: "a m" }
        02:00 -> time { hours: "two" }
        2:00 -> time { hours: "two" }
        10:00:05 a.m. -> time { hours: "ten" minutes: "zero" seconds: "five" suffix: "a m" }

    Args:
        cardinal: CardinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)
        # Time-suffix labels ("a.m.", "pm", ...) extended with variants that carry
        # trailing punctuation, so a suffix followed by sentence punctuation still matches.
        suffix_labels = load_labels(get_abs_path("data/time/suffix.tsv"))
        suffix_labels.extend(augment_labels_with_punct_at_end(suffix_labels))
        suffix_graph = pynini.string_map(suffix_labels)
        time_zone_graph = pynini.string_file(get_abs_path("data/time/zone.tsv"))
        # only used for < 1000 thousand -> 0 weight
        cardinal = cardinal.graph
        # Accepted numeric surface forms: hours 0-23; minutes/seconds split into
        # single-digit (1-9, used after a leading "0") and double-digit (10-59) sets.
        labels_hour = [str(x) for x in range(0, 24)]
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]
        # Pass two-digit values through unchanged; for single-digit values optionally
        # strip one leading zero ("02" -> "2").
        delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
        )
        graph_hour = delete_leading_zero_to_double_digit @ pynini.union(*labels_hour) @ cardinal
        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
        final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
        # "05" -> "o five"; "30" -> "thirty"
        final_graph_minute = (
            pynutil.insert("minutes: \"")
            + (pynini.cross("0", "o") + insert_space + graph_minute_single | graph_minute_double)
            + pynutil.insert("\"")
        )
        final_graph_second = (
            pynutil.insert("seconds: \"")
            + (pynini.cross("0", "o") + insert_space + graph_minute_single | graph_minute_double)
            + pynutil.insert("\"")
        )
        final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert("zone: \"")
            + convert_space(time_zone_graph)
            + pynutil.insert("\""),
            0,
            1,
        )
        # 2:30 pm, 02:30, 2:00  (":00" minutes are dropped entirely)
        graph_hm = (
            final_graph_hour
            + pynutil.delete(":")
            + (pynutil.delete("00") | insert_space + final_graph_minute)
            + final_suffix_optional
            + final_time_zone_optional
        )
        # 10:30:05 pm,  ("00" fields verbalize as explicit "zero" so seconds stay aligned)
        graph_hms = (
            final_graph_hour
            + pynutil.delete(":")
            + (pynini.cross("00", " minutes: \"zero\"") | insert_space + final_graph_minute)
            + pynutil.delete(":")
            + (pynini.cross("00", " seconds: \"zero\"") | insert_space + final_graph_second)
            + final_suffix_optional
            + final_time_zone_optional
        )
        # 2.xx pm/am  (dot separator requires a suffix, otherwise it would clash with decimals)
        graph_hm2 = (
            final_graph_hour
            + pynutil.delete(".")
            + (pynutil.delete("00") | insert_space + final_graph_minute)
            + delete_space
            + insert_space
            + final_suffix
            + final_time_zone_optional
        )
        # 2 pm est  (bare hour also requires a suffix)
        graph_h = final_graph_hour + delete_space + insert_space + final_suffix + final_time_zone_optional
        final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

View File

@@ -0,0 +1,201 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.abbreviation import AbbreviationFst
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDateFst
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinalFst
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTimeFst
from pynini.lib import pynutil
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Cache hit: restore the previously compiled grammar instead of rebuilding.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        else:
            # Build the per-class taggers; several of them share the cardinal grammar.
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
            time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
            whitelist_graph = WhiteListFst(
                input_case=input_case, deterministic=deterministic, input_file=whitelist
            ).fst
            punctuation = PunctuationFst(deterministic=deterministic)
            punct_graph = punctuation.fst
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst
            serial_graph = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic).fst

            # RangeFst consumes fully verbalized time/date, so those taggers are
            # pre-composed with their verbalizers here.
            v_time_graph = vTimeFst(deterministic=deterministic).fst
            v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
            v_date_graph = vDateFst(ordinal=v_ordinal_graph, deterministic=deterministic).fst
            time_final = pynini.compose(time_graph, v_time_graph)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(
                time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic
            ).fst

            # Union of all semiotic classes; lower weight wins, so whitelist (1.01)
            # and date (1.09) are preferred over the generic 1.1 classes.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(range_graph, 1.1)
                | pynutil.add_weight(serial_graph, 1.1001)  # should be higher than the rest of the classes
            )
            roman_graph = RomanFst(deterministic=deterministic).fst
            classify |= pynutil.add_weight(roman_graph, 1.1)
            if not deterministic:
                # Extra alternatives only make sense for audio-based normalization.
                abbreviation_graph = AbbreviationFst(deterministic=deterministic).fst
                classify |= pynutil.add_weight(abbreviation_graph, 100)

            # Wrap punctuation into its own token and allow it (or collapsed
            # whitespace) between tokens.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )
            # Plain words are a high-weight fallback when no semiotic class matches.
            classify |= pynutil.add_weight(word_graph, 100)
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )
            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )
            graph = delete_space + graph + delete_space
            graph |= punct
            self.fst = graph.optimize()
            if far_file:
                # Persist the compiled grammar for future runs.
                generator_main(far_file, {"tokenize_and_classify": self.fst})

View File

@@ -0,0 +1,228 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst as vCardinal
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDate
from nemo_text_processing.text_normalization.en.verbalizers.decimal import DecimalFst as vDecimal
from nemo_text_processing.text_normalization.en.verbalizers.electronic import ElectronicFst as vElectronic
from nemo_text_processing.text_normalization.en.verbalizers.fraction import FractionFst as vFraction
from nemo_text_processing.text_normalization.en.verbalizers.measure import MeasureFst as vMeasure
from nemo_text_processing.text_normalization.en.verbalizers.money import MoneyFst as vMoney
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinal
from nemo_text_processing.text_normalization.en.verbalizers.roman import RomanFst as vRoman
from nemo_text_processing.text_normalization.en.verbalizers.telephone import TelephoneFst as vTelephone
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord
from pynini.examples import plurals
from pynini.lib import pynutil
from nemo.utils import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    NOTE(review): this variant writes a "_lm.far" cache and passes lm=True to several
    grammars — presumably the language-model rescoring flavor of the classifier; confirm
    against the callers.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}_lm.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Cache hit: restore the compiled FST instead of rebuilding all grammars.
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Companion FST that rejects any digit in the output (fully verbalized text only).
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            # Most taggers are built with deterministic=True regardless of the argument;
            # money/whitelist use deterministic=False explicitly.
            cardinal = CardinalFst(deterministic=True, lm=True)
            cardinal_tagger = cardinal  # kept: RangeFst below needs the tagger object itself
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=True)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=True, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=True)
            measure_graph = measure.fst
            date = DateFst(cardinal=cardinal, deterministic=True, lm=True)
            date_graph = date.fst
            punctuation = PunctuationFst(deterministic=True)
            punct_graph = punctuation.graph
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal, deterministic=True).fst
            telephone_graph = TelephoneFst(deterministic=True).fst
            electronic_graph = ElectronicFst(deterministic=True).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=False).fst
            # NOTE: `whitelist` is rebound here from the input file path to the FST object.
            whitelist = WhiteListFst(input_case=input_case, deterministic=False, input_file=whitelist)
            whitelist_graph = whitelist.graph
            serial_graph = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic, lm=True).fst
            # VERBALIZERS
            # The tagger locals are rebound to verbalizer instances below — from here on
            # `cardinal`/`decimal`/`ordinal`/`fraction`/`measure` refer to verbalizers.
            cardinal = vCardinal(deterministic=True)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=True)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=True)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=True, lm=True)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=True).fst
            v_electronic_graph = vElectronic(deterministic=True).fst
            measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=False)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=True).fst
            v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic, lm=True).fst
            v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst
            # Ambiguous digit strings: prefer the date reading over plain cardinal.
            cardinal_or_date_final = plurals._priority_union(date_graph, cardinal_graph, NEMO_SIGMA)
            cardinal_or_date_final = pynini.compose(cardinal_or_date_final, (v_cardinal_graph | v_date_graph))
            time_final = pynini.compose(time_graph, v_time_graph)
            ordinal_final = pynini.compose(ordinal_graph, v_ordinal_graph)
            # Weights: semiotic classes (sem_w) strongly preferred over plain words
            # (word_w); punctuation in between.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Each tagger is composed with its verbalizer so the output is spoken text.
            classify_and_verbalize = (
                pynutil.add_weight(time_final, sem_w)
                | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(ordinal_final, sem_w)
                | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
                | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(cardinal_or_date_final, sem_w)
                | pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph), 1.1001
                )  # should be higher than the rest of the classes
            ).optimize()
            roman_graph = RomanFst(deterministic=deterministic, lm=True).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(pynini.compose(roman_graph, v_roman_graph), sem_w)
            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(
                time=time_final, cardinal=cardinal_tagger, date=date_final, deterministic=deterministic
            ).fst
            classify_and_verbalize |= pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)
            # Mark verbalized semiotic spans with "< ... >"; plain words go unmarked.
            classify_and_verbalize = pynutil.insert("< ") + classify_and_verbalize + pynutil.insert(" >")
            classify_and_verbalize |= pynutil.add_weight(word_graph, word_w)
            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            # Between-token glue: collapsed whitespace or attached punctuation.
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            def get_token_sem_graph(classify_and_verbalize):
                # Assemble the sentence-level graph: tokens optionally surrounded by
                # punctuation, repeated, with extra whitespace normalized away.
                token_plus_punct = (
                    pynini.closure(punct + pynutil.insert(" "))
                    + classify_and_verbalize
                    + pynini.closure(pynutil.insert(" ") + punct)
                )
                graph = token_plus_punct + pynini.closure(
                    (
                        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                        | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                    )
                    + token_plus_punct
                )
                # Punctuation-only inputs are also valid.
                graph |= punct_only + pynini.closure(punct)
                graph = delete_space + graph + delete_space
                # Squeeze runs of spaces left over from insert/delete operations,
                # including leading spaces (second alternative).
                remove_extra_spaces = pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                    delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)
                )
                remove_extra_spaces |= (
                    pynini.closure(pynutil.delete(" "), 1)
                    + pynini.closure(NEMO_NOT_SPACE, 1)
                    + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
                )
                graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
                return graph

            self.fst = get_token_sem_graph(classify_and_verbalize)
            # Companion FST that rejects any digit in the output.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')

View File

@@ -0,0 +1,229 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.abbreviation import AbbreviationFst
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.taggers.date import DateFst
from nemo_text_processing.text_normalization.en.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.en.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.en.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.en.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.en.taggers.range import RangeFst as RangeFst
from nemo_text_processing.text_normalization.en.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.en.taggers.serial import SerialFst
from nemo_text_processing.text_normalization.en.taggers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.en.taggers.time import TimeFst
from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.en.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.verbalizers.abbreviation import AbbreviationFst as vAbbreviation
from nemo_text_processing.text_normalization.en.verbalizers.cardinal import CardinalFst as vCardinal
from nemo_text_processing.text_normalization.en.verbalizers.date import DateFst as vDate
from nemo_text_processing.text_normalization.en.verbalizers.decimal import DecimalFst as vDecimal
from nemo_text_processing.text_normalization.en.verbalizers.electronic import ElectronicFst as vElectronic
from nemo_text_processing.text_normalization.en.verbalizers.fraction import FractionFst as vFraction
from nemo_text_processing.text_normalization.en.verbalizers.measure import MeasureFst as vMeasure
from nemo_text_processing.text_normalization.en.verbalizers.money import MoneyFst as vMoney
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as vOrdinal
from nemo_text_processing.text_normalization.en.verbalizers.roman import RomanFst as vRoman
from nemo_text_processing.text_normalization.en.verbalizers.telephone import TelephoneFst as vTelephone
from nemo_text_processing.text_normalization.en.verbalizers.time import TimeFst as vTime
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst as vWord
from pynini.lib import pynutil
from nemo.utils import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Cache hit: restore the compiled FST instead of rebuilding all grammars.
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Companion FST that rejects any digit in the output (fully verbalized text only).
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
            # Serial numbers always use the deterministic ordinal grammar.
            deterministic_ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
            ordinal_graph = ordinal.fst
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
            fraction_graph = fraction.fst
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
            punctuation = PunctuationFst(deterministic=True)
            punct_graph = punctuation.graph
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
            # NOTE: `whitelist` is rebound from the input file path to the WhiteListFst
            # instance; the instance is later handed to AbbreviationFst.
            whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
            whitelist_graph = whitelist.graph
            serial_graph = SerialFst(cardinal=cardinal, ordinal=deterministic_ordinal, deterministic=deterministic).fst
            # VERBALIZERS
            # The tagger locals are rebound to verbalizer instances below — from here on
            # `cardinal`/`decimal`/`ordinal`/`fraction`/`measure` refer to verbalizers.
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst
            # RangeFst consumes fully verbalized time/date, built deterministically.
            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True), deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(
                time=time_final, date=date_final, cardinal=CardinalFst(deterministic=True), deterministic=deterministic
            ).fst
            v_word_graph = vWord(deterministic=deterministic).fst
            # Weights: semiotic classes (sem_w) strongly preferred over plain words
            # (word_w); punctuation in between.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Each tagger is composed with its verbalizer so the output is spoken text;
            # date gets a slightly lower (better) weight than the other classes.
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph), sem_w)
                | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
                | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph), sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph), 1.1001
                )  # should be higher than the rest of the classes
            ).optimize()
            if not deterministic:
                # Extra alternatives only make sense for audio-based normalization.
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(pynini.compose(roman_graph, v_roman_graph), word_w)
                abbreviation_graph = AbbreviationFst(whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w
                )
            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            # Between-token glue: collapsed whitespace or attached punctuation.
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )
            # Sentence-level graph: tokens optionally surrounded by punctuation, repeated.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + classify_and_verbalize
                + pynini.closure(pynutil.insert(" ") + punct)
            )
            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )
            # Punctuation-only inputs are also valid.
            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space
            # Squeeze runs of spaces left over from insert/delete operations, including
            # leading spaces (second alternative).
            remove_extra_spaces = pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)
            )
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1)
                + pynini.closure(NEMO_NOT_SPACE, 1)
                + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
            )
            graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
            self.fst = graph
            # Companion FST that rejects any digit in the output.
            no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')

View File

@@ -0,0 +1,151 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_CHAR,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_UPPER,
SINGULAR_TO_PLURAL,
GraphFst,
convert_space,
)
from nemo_text_processing.text_normalization.en.taggers.roman import get_names
from nemo_text_processing.text_normalization.en.utils import (
augment_labels_with_punct_at_end,
get_abs_path,
load_labels,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelist, e.g.
    misses -> tokens { name: "mrs" }
    for non-deterministic case: "Dr. Abc" ->
    tokens { name: "drive" } tokens { name: "Abc" }
    tokens { name: "doctor" } tokens { name: "Abc" }
    tokens { name: "Dr." } tokens { name: "Abc" }
    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        input_file: path to a file with whitelist replacements
    """
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Build a string-map FST from a two-column TSV of written -> spoken pairs.
            # keep_punct_add_end additionally adds variants that keep a trailing
            # period on the spoken side (see augment_labels_with_punct_at_end).
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]
            if keep_punct_add_end:
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))
            graph = pynini.string_map(whitelist)
            return graph
        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
        graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/UK_to_US.tsv"))  # Jiayu 2022.10
        # symbol replacements apply only to inputs containing no "/"
        graph |= pynini.compose(
            pynini.difference(NEMO_SIGMA, pynini.accep("/")).optimize(),
            _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv")),
        ).optimize()
        if deterministic:
            names = get_names()
            # "St. <Name>" / "st <Name>" -> "Saint <Name>", only before a known name
            graph |= (
                pynini.cross(pynini.union("st", "St", "ST"), "Saint")
                + pynini.closure(pynutil.delete("."))
                + pynini.accep(" ")
                + names
            )
        else:
            graph |= _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
            )
            # collapse dotted all-caps abbreviations (3+ letters), e.g. "U.S.A." -> "USA"
            for x in [".", ". "]:
                graph |= (
                    NEMO_UPPER
                    + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                    + pynini.closure(pynutil.delete("."), 0, 1)
                )
        if not deterministic:
            multiple_forms_whitelist_graph = get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))
            graph |= multiple_forms_whitelist_graph
            # accept measurement unit spellings (and their plurals), but only for
            # inputs of at least 3 characters
            graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
                get_abs_path("data/measure/unit_alternatives.tsv")
            )
            graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
            units_graph = pynini.compose(NEMO_CHAR ** (3, ...), convert_space(graph_unit | graph_unit_plural))
            graph |= units_graph
        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        states = load_labels(get_abs_path("data/address/state.tsv"))
        additional_options = []
        for x, y in states:
            if input_case == "lower_cased":
                x = x.lower()
            # adds a dotted variant of the second column (period after its first
            # character) — presumably the state abbreviation; confirm against state.tsv
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
            if not deterministic:
                additional_options.append((x, f"{y[0]}.{y[1:]}."))
        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        # only convert when preceded by "<word>," to avoid all-caps false positives
        graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()
        if input_file:
            # a user-provided whitelist replaces the built-in graph in the
            # deterministic case and extends it otherwise
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                graph = whitelist_provided
        self.graph = (convert_space(graph)).optimize()
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def get_formats(input_f, input_case="cased", is_default=True):
    """
    Build an FST that accepts several spellings of each abbreviation.

    For every (abbreviation, expansion) pair loaded from ``input_f`` this adds
    a period-terminated form ("dr." -> doctor) plus capitalized variants
    ("Dr" -> Doctor, "Dr." -> Doctor) on top of the original pair. When
    ``is_default`` is False, outputs are wrapped in raw/norm markers.
    """
    formats = load_labels(input_f)
    extra = []
    for abbr, expansion in formats:
        if input_case == "lower_cased":
            abbr = abbr.lower()
        capitalized_abbr = abbr[0].upper() + abbr[1:]
        capitalized_expansion = expansion[0].upper() + expansion[1:]
        # "dr." -> doctor (period kept on input), "Dr" -> Doctor, "Dr." -> Doctor
        extra.append((abbr + ".", expansion))
        extra.append((capitalized_abbr, capitalized_expansion))
        extra.append((capitalized_abbr + ".", capitalized_expansion))
    formats.extend(extra)
    if not is_default:
        formats = [
            (written, f"|raw_start|{written}|raw_end||norm_start|{norm}|norm_end|")
            for (written, norm) in formats
        ]
    return pynini.string_map(formats)

View File

@@ -0,0 +1,90 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
MIN_NEG_WEIGHT,
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
GraphFst,
convert_space,
get_abs_path,
)
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from pynini.examples import plurals
from pynini.lib import pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for classifying word. Considers sentence boundary exceptions.
    e.g. sleep -> tokens { name: "sleep" }

    Args:
        punctuation: PunctuationFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, punctuation: GraphFst, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)
        punct = PunctuationFst().graph
        # default word: one or more non-space, non-punctuation characters
        default_graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)
        # NOTE(review): the two empty strings below look like mis-encoded currency
        # symbols (likely "€" and "₩") — confirm against the original source file
        symbols_to_exclude = (pynini.union("$", "", "", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)
        # words free of digits/currency symbols get a negative weight, i.e. priority
        graph = pynutil.add_weight(graph, MIN_NEG_WEIGHT) | default_graph
        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.accep(pynini.escape("]"))
        )
        # leave IPA phones of format [ˈdoʊv] untouched, single words and sentences with punctuation marks allowed
        punct_marks = pynini.union(*punctuation.punct_marks).optimize()
        stress = pynini.union("ˈ", "'", "ˌ")
        ipa_phoneme_unit = pynini.string_file(get_abs_path("data/whitelist/ipa_symbols.tsv"))
        # word in ipa form
        ipa_phonemes = (
            pynini.closure(stress, 0, 1)
            + pynini.closure(ipa_phoneme_unit, 1)
            + pynini.closure(stress | ipa_phoneme_unit)
        )
        # allow sentences of words in IPA format separated with spaces or punct marks
        delim = (punct_marks | pynini.accep(" ")) ** (1, ...)
        ipa_phonemes = ipa_phonemes + pynini.closure(delim + ipa_phonemes) + pynini.closure(delim, 0, 1)
        ipa_phonemes = (pynini.accep(pynini.escape("[")) + ipa_phonemes + pynini.accep(pynini.escape("]"))).optimize()
        if not deterministic:
            # also tolerate optional spaces just inside the brackets
            phoneme = (
                pynini.accep(pynini.escape("["))
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.closure(phoneme_unit + pynini.accep(" "))
                + phoneme_unit
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.accep(pynini.escape("]"))
            ).optimize()
            # NOTE(review): ipa_phonemes was already bracket-wrapped above, so this
            # wraps it a second time ("[[...]]") — confirm this is intended
            ipa_phonemes = (
                pynini.accep(pynini.escape("[")) + ipa_phonemes + pynini.accep(pynini.escape("]"))
            ).optimize()
        phoneme |= ipa_phonemes
        # bracketed phoneme/IPA spans take priority over the plain-word graph
        self.graph = plurals._priority_union(convert_space(phoneme.optimize()), graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()

View File

@@ -0,0 +1,60 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import os
def get_abs_path(rel_path):
    """
    Resolve a path relative to this file's directory into an absolute path.

    Args:
        rel_path: path relative to the directory containing this file

    Returns:
        absolute path string
    """
    # os.path.join instead of manual "/" concatenation: avoids duplicated
    # separators and is correct on every platform.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
def load_labels(abs_path):
    """
    Load a tab-separated label file as a list of rows.

    Args:
        abs_path: absolute path to the .tsv file

    Returns:
        list of rows, each a list of column strings
    """
    # Use a context manager so the file handle is always closed; the previous
    # implementation opened the file and never closed it.
    with open(abs_path, encoding="utf-8") as label_tsv:
        return list(csv.reader(label_tsv, delimiter="\t"))
def augment_labels_with_punct_at_end(labels):
    """
    Create additional labels for keys ending in a period whose value does not:
    the new label carries the period over to the value as well.

    Args:
        labels: input labels, each a [key, value, ...] list

    Returns:
        list of additional labels (does not modify the input)
    """
    return [
        [entry[0], entry[1] + "."] + entry[2:]
        for entry in labels
        if len(entry) > 1 and entry[0][-1] == "." and entry[1][-1] != "."
    ]

View File

@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,35 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class AbbreviationFst(GraphFst):
    """
    Finite state transducer for verbalizing abbreviations,
    e.g. tokens { abbreviation { value: "A B C" } } -> "ABC"

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, deterministic: bool = True):
        super().__init__(name="abbreviation", kind="verbalize", deterministic=deterministic)
        # Strip the protobuf-style field wrapper and keep the quoted payload.
        value = pynutil.delete("value: \"")
        value += pynini.closure(NEMO_NOT_QUOTE, 1)
        value += pynutil.delete("\"")
        self.fst = self.delete_tokens(value).optimize()

View File

@@ -0,0 +1,45 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal, e.g.
    cardinal { negative: "true" integer: "23" } -> minus twenty three

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """
    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
        # 'negative: "true"' verbalizes as "minus " (plus "negative " in the
        # non-deterministic grammar); the whole sign is optional.
        sign = pynini.cross("negative: \"true\"", "minus ")
        if not deterministic:
            sign |= pynini.cross("negative: \"true\"", "negative ")
        self.optional_sign = pynini.closure(sign + delete_space, 0, 1)
        # Quoted integer payload; exposed as an attribute for reuse by other
        # verbalizers (e.g. decimal, money).
        self.integer = (
            delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE)
            + pynutil.delete("\"")
        )
        self.numbers = self.optional_sign + pynutil.delete("integer:") + self.integer
        self.fst = self.delete_tokens(self.numbers).optimize()

View File

@@ -0,0 +1,101 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.examples import plurals
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
    date { month: "february" day: "five" year: "twenty twelve" preserve_order: true } -> february fifth twenty twelve
    date { day: "five" month: "february" year: "twenty twelve" preserve_order: true } -> the fifth of february twenty twelve

    Args:
        ordinal: OrdinalFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: presumably enables a language-model-based selection pipeline — confirm with callers
    """
    def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)
        month = pynini.closure(NEMO_NOT_QUOTE, 1)
        # day: "<cardinal words>" -> the spelled-out cardinal, wrapper removed
        day_cardinal = (
            pynutil.delete("day:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        # turn the cardinal day into an ordinal, e.g. "five" -> "fifth"
        day = day_cardinal @ ordinal.suffix
        month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"")
        year = (
            pynutil.delete("year:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + delete_space
            + pynutil.delete("\"")
        )
        # month (day) year
        graph_mdy = (
            month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1)
        )
        # may 5 -> may five
        if not deterministic and not lm:
            graph_mdy |= (
                month
                + pynini.closure(delete_extra_space + day_cardinal, 0, 1)
                + pynini.closure(delete_extra_space + year, 0, 1)
            )
        # day month year
        graph_dmy = (
            pynutil.insert("the ")
            + day
            + delete_extra_space
            + pynutil.insert("of ")
            + month
            + pynini.closure(delete_extra_space + year, 0, 1)
        )
        # silently drop ordering metadata emitted by the tagger
        optional_preserve_order = pynini.closure(
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete("\"")
            + NEMO_NOT_QUOTE
            + pynutil.delete("\"")
            + delete_space
        )
        # prefer month-day-year; day-month-year is slightly penalized; a bare year is also allowed
        final_graph = (
            (plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year)
            + delete_space
            + optional_preserve_order
        )
        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,67 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from pynini.lib import pynutil
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal, e.g.
    decimal { negative: "true" integer_part: "twelve" fractional_part: "five o o six" quantity: "billion" } -> minus twelve point five o o six billion

    Args:
        cardinal: CardinalFst (supplies the quoted-integer sub-grammar via cardinal.integer)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)
        # 'negative: "true"' -> "minus " ("negative " also allowed when non-deterministic)
        self.optional_sign = pynini.cross("negative: \"true\"", "minus ")
        if not deterministic:
            self.optional_sign |= pynini.cross("negative: \"true\"", "negative ")
        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
        # integer_part: "<words>" -> <words>; exposed for reuse (e.g. by MoneyFst)
        self.integer = pynutil.delete("integer_part:") + cardinal.integer
        self.optional_integer = pynini.closure(self.integer + delete_space + insert_space, 0, 1)
        # fractional_part payload without the leading "point "
        self.fractional_default = (
            pynutil.delete("fractional_part:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        self.fractional = pynutil.insert("point ") + self.fractional_default
        # quantity payload, e.g. "billion", preceded by a single space
        self.quantity = (
            delete_space
            + insert_space
            + pynutil.delete("quantity:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        self.optional_quantity = pynini.closure(self.quantity, 0, 1)
        # sign? ( integer | integer quantity | integer? "point" fractional quantity? )
        graph = self.optional_sign + (
            self.integer
            | (self.integer + self.quantity)
            | (self.optional_integer + self.fractional + self.optional_quantity)
        )
        self.numbers = graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,97 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_NOT_SPACE,
NEMO_SIGMA,
TO_UPPER,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.examples import plurals
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic
    e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> c d f one at a b c dot e d u

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
        # digits are spelled out; "0" may also verbalize as "o"/"oh" when non-deterministic
        graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")
        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
        # spell out every digit/symbol in place, surrounding each with spaces
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
        )
        # restrict the rewrite to space-free payloads
        default_chars_symbols = pynini.compose(
            pynini.closure(NEMO_NOT_SPACE), default_chars_symbols.optimize()
        ).optimize()
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete("\"")
            + default_chars_symbols
            + pynutil.delete("\"")
        )
        domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        # prefer known domain endings; otherwise read "." as "dot" (slightly penalized)
        domain = (
            default_chars_symbols
            + insert_space
            + plurals._priority_union(
                domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
            )
            + pynini.closure(
                insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
            )
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete("\"")
            + domain
            + delete_space
            + pynutil.delete("\"")
        ).optimize()
        protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        # protocol? username-"at"? domain; then collapse any doubled spaces
        graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,88 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst
from pynini.examples import plurals
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fraction
    e.g. tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } ->
    twenty three and four fifth

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: presumably enables a language-model-based selection pipeline — confirm with callers
    """
    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
        # cardinal -> ordinal rewrite, e.g. "five" -> "fifth"
        suffix = OrdinalFst().suffix
        integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
        # special-cased denominators: "one" -> "over one", "two" -> "half", "four" -> "quarter"
        denominator_one = pynini.cross("denominator: \"one\"", "over one")
        denominator_half = pynini.cross("denominator: \"two\"", "half")
        denominator_quarter = pynini.cross("denominator: \"four\"", "quarter")
        denominator_rest = (
            pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) @ suffix + pynutil.delete("\"")
        )
        # nested priority unions: one > half > quarter > generic ordinal
        denominators = plurals._priority_union(
            denominator_one,
            plurals._priority_union(
                denominator_half,
                plurals._priority_union(denominator_quarter, denominator_rest, NEMO_SIGMA),
                NEMO_SIGMA,
            ),
            NEMO_SIGMA,
        ).optimize()
        if not deterministic:
            # additionally allow "four" -> "fourth"
            denominators |= pynutil.delete("denominator: \"") + (pynini.accep("four") @ suffix) + pynutil.delete("\"")
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ")
        numerator_one = numerator_one + insert_space + denominators
        numerator_rest = (
            pynutil.delete("numerator: \"")
            + (pynini.closure(NEMO_NOT_QUOTE) - pynini.accep("one"))
            + pynutil.delete("\" ")
        )
        numerator_rest = numerator_rest + insert_space + denominators
        # pluralize the denominator when the numerator is not "one":
        # "half" -> "halves", otherwise append "s"
        numerator_rest @= pynini.cdrewrite(
            plurals._priority_union(pynini.cross("half", "halves"), pynutil.insert("s"), NEMO_SIGMA),
            "",
            "[EOS]",
            NEMO_SIGMA,
        )
        graph = numerator_one | numerator_rest
        conjunction = pynutil.insert("and ")
        if not deterministic and not lm:
            conjunction = pynini.closure(conjunction, 0, 1)
        integer = pynini.closure(integer + insert_space + conjunction, 0, 1)
        graph = integer + graph
        # final fix-ups: "and one half" -> "and a half"; undo spurious "over ones"
        graph @= pynini.cdrewrite(
            pynini.cross("and one half", "and a half") | pynini.cross("over ones", "over one"), "", "[EOS]", NEMO_SIGMA
        )
        self.graph = graph
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,102 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
    measure { negative: "true" cardinal { integer: "twelve" } units: "kilograms" } -> minus twelve kilograms
    measure { decimal { integer_part: "twelve" fractional_part: "five" } units: "kilograms" } -> twelve point five kilograms
    tokens { measure { units: "covid" decimal { integer_part: "nineteen" fractional_part: "five" } } } -> covid nineteen point five

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
        fraction: FractionFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
        optional_sign = cardinal.optional_sign
        # units payload; "address" and "math" are excluded here and handled by
        # the dedicated branches at the bottom
        unit = (
            pynutil.delete("units: \"")
            + pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("address", "math"))
            + pynutil.delete("\"")
            + delete_space
        )
        if not deterministic:
            # non-deterministic option: render inch/inches back as the double-quote symbol
            unit |= pynini.compose(unit, pynini.cross(pynini.union("inch", "inches"), "\""))
        # unwrap the nested decimal / cardinal / fraction sub-messages
        graph_decimal = (
            pynutil.delete("decimal {")
            + delete_space
            + optional_sign
            + delete_space
            + decimal.numbers
            + delete_space
            + pynutil.delete("}")
        )
        graph_cardinal = (
            pynutil.delete("cardinal {")
            + delete_space
            + optional_sign
            + delete_space
            + cardinal.numbers
            + delete_space
            + pynutil.delete("}")
        )
        graph_fraction = (
            pynutil.delete("fraction {") + delete_space + fraction.graph + delete_space + pynutil.delete("}")
        )
        # number followed by unit
        graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + insert_space + unit
        # SH adds "preserve_order: true" by default
        preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
        # unit-first order, e.g. "covid nineteen point five"
        graph |= unit + insert_space + (graph_cardinal | graph_decimal) + delete_space + pynini.closure(preserve_order)
        # for only unit
        graph |= (
            pynutil.delete("cardinal { integer: \"-\"")
            + delete_space
            + pynutil.delete("}")
            + delete_space
            + unit
            + pynini.closure(preserve_order)
        )
        # "address"/"math" units only mark the token type; the value is the bare cardinal
        address = (
            pynutil.delete("units: \"address\" ")
            + delete_space
            + graph_cardinal
            + delete_space
            + pynini.closure(preserve_order)
        )
        math = (
            pynutil.delete("units: \"math\" ")
            + delete_space
            + graph_cardinal
            + delete_space
            + pynini.closure(preserve_order)
        )
        graph |= address | math
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,71 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
GraphFst,
delete_extra_space,
delete_preserve_order,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
    money { integer_part: "twelve" fractional_part: "o five" currency: "dollars" } -> twelve o five dollars

    Args:
        decimal: DecimalFst (supplies the quoted-integer and decimal sub-grammars)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """
    def __init__(self, decimal: GraphFst, deterministic: bool = True):
        super().__init__(name="money", kind="verbalize", deterministic=deterministic)
        keep_space = pynini.accep(" ")
        # quoted payloads of the major/minor currency and fractional fields
        maj = pynutil.delete("currency_maj: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        # renamed from `min` to avoid shadowing the builtin
        minor = pynutil.delete("currency_min: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        fractional_part = (
            pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        )
        integer_part = decimal.integer
        # *** currency_maj
        graph_integer = integer_part + keep_space + maj
        # *** currency_maj + (***) | ((and) *** current_min)
        fractional = fractional_part + delete_extra_space + minor
        if not deterministic:
            # optionally join major and minor parts with "and"
            fractional |= pynutil.insert("and ") + fractional
        graph_integer_with_minor = integer_part + keep_space + maj + keep_space + fractional + delete_preserve_order
        # *** point *** currency_maj
        graph_decimal = decimal.numbers + keep_space + maj
        # *** current_min
        graph_minor = fractional_part + delete_extra_space + minor + delete_preserve_order
        graph = graph_integer | graph_integer_with_minor | graph_decimal | graph_minor
        if not deterministic:
            graph |= graph_integer + delete_preserve_order
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,53 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space
from nemo_text_processing.text_normalization.en.utils import get_abs_path
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
        ordinal { integer: "thirteen" } -> thirteenth

    Exposes `self.suffix` (the word-final cardinal->ordinal rewrite) for reuse
    by other verbalizers.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Inverted TSV maps turn cardinal endings into ordinal endings
        # (e.g. "one" -> "first", "twelve" -> "twelfth").
        digit_map = pynini.string_file(get_abs_path("data/ordinal/digit.tsv")).invert()
        teen_map = pynini.string_file(get_abs_path("data/ordinal/teen.tsv")).invert()

        # Strip the serialized field wrapper: integer: "<words>" -> <words>
        strip_field = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        # Word-final rewrite: special-cased digit/teen endings, "ty" -> "tieth",
        # and a generic fallback that appends "th".
        append_th = pynutil.insert("th")
        ending_rewrite = pynini.cdrewrite(
            digit_map | teen_map | pynini.cross("ty", "tieth") | append_th, "", "[EOS]", NEMO_SIGMA,
        ).optimize()

        self.graph = pynini.compose(strip_field, ending_rewrite)
        self.suffix = ending_rewrite
        self.fst = self.delete_tokens(self.graph).optimize()

View File

@@ -0,0 +1,180 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
MIN_NEG_WEIGHT,
NEMO_ALPHA,
NEMO_CHAR,
NEMO_SIGMA,
NEMO_SPACE,
generator_main,
)
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from pynini.lib import pynutil
class PostProcessingFst:
    """
    Finite state transducer that post-processes an entire sentence after verbalization is complete, e.g.
    removes extra spaces around punctuation marks " ( one hundred and twenty three ) " -> "(one hundred and twenty three)"

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "en_tn_post_processing.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Load the previously compiled grammar instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
        else:
            self.set_punct_dict()
            self.fst = self.get_punct_postprocess_graph()
            if far_file:
                # Persist the compiled grammar so later runs can skip compilation.
                generator_main(far_file, {"post_process_graph": self.fst})

    def set_punct_dict(self):
        # Map a canonical apostrophe to its Unicode look-alike variants; used
        # to remove spaces around possessive constructions (e.g. "John' s" -> "John's").
        self.punct_marks = {
            "'": [
                "'",
                '´',
                'ʹ',
                'ʻ',
                'ʼ',
                'ʽ',
                'ʾ',
                'ˈ',
                'ˊ',
                'ˋ',
                '˴',
                'ʹ',
                '΄',
                '՚',
                '՝',
                'י',
                '׳',
                'ߴ',
                'ߵ',
                '',
                '',
                '',
                '᾿',
                '',
                '',
                '',
                '',
                '',
                '',
                '',
                '',
                '',
                '',
                '',
                '𖽑',
                '𖽒',
            ],
        }

    def get_punct_postprocess_graph(self):
        """
        Returns graph to post process punctuation marks.

        {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
        By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
        """
        punct_marks_all = PunctuationFst().punct_marks
        # no_space_before_punct assume no space before them
        quotes = ["'", "\"", "``", "«"]
        dashes = ["-", ""]
        brackets = ["<", "{", "("]
        open_close_single_quotes = [
            ("`", "`"),
        ]
        open_close_double_quotes = [('"', '"'), ("``", "``"), ("", "")]
        open_close_symbols = open_close_single_quotes + open_close_double_quotes
        # Marks that may legitimately be preceded by a space (opening symbols, quotes, dashes).
        allow_space_before_punct = ["&"] + quotes + dashes + brackets + [k[0] for k in open_close_symbols]
        no_space_before_punct = [m for m in punct_marks_all if m not in allow_space_before_punct]
        no_space_before_punct = pynini.union(*no_space_before_punct)
        no_space_after_punct = pynini.union(*brackets)
        delete_space = pynutil.delete(" ")
        delete_space_optional = pynini.closure(delete_space, 0, 1)

        # non_punct allows space
        # delete space before no_space_before_punct marks, if present
        non_punct = pynini.difference(NEMO_CHAR, no_space_before_punct).optimize()
        graph = (
            pynini.closure(non_punct)
            + pynini.closure(
                # Negative weight makes the space-deleting path preferred over keeping the space.
                no_space_before_punct | pynutil.add_weight(delete_space + no_space_before_punct, MIN_NEG_WEIGHT)
            )
            + pynini.closure(non_punct)
        )
        graph = pynini.closure(graph).optimize()
        # Normalize backtick-style double quotes to a plain double quote everywhere.
        graph = pynini.compose(
            graph, pynini.cdrewrite(pynini.cross("``", '"'), "", "", NEMO_SIGMA).optimize()
        ).optimize()

        # remove space after no_space_after_punct (even if there are no matching closing brackets)
        no_space_after_punct = pynini.cdrewrite(delete_space, no_space_after_punct, NEMO_SIGMA, NEMO_SIGMA).optimize()
        graph = pynini.compose(graph, no_space_after_punct).optimize()

        # remove space around text in quotes
        single_quote = pynutil.add_weight(pynini.accep("`"), MIN_NEG_WEIGHT)
        double_quotes = pynutil.add_weight(pynini.accep('"'), MIN_NEG_WEIGHT)
        quotes_graph = (
            single_quote + delete_space_optional + NEMO_ALPHA + NEMO_SIGMA + delete_space_optional + single_quote
        ).optimize()

        # this is to make sure multiple quotes are tagged from right to left without skipping any quotes in the left
        not_alpha = pynini.difference(NEMO_CHAR, NEMO_ALPHA).optimize() | pynutil.add_weight(
            NEMO_SPACE, MIN_NEG_WEIGHT
        )
        end = pynini.closure(pynutil.add_weight(not_alpha, MIN_NEG_WEIGHT))
        quotes_graph |= (
            double_quotes
            + delete_space_optional
            + NEMO_ALPHA
            + NEMO_SIGMA
            + delete_space_optional
            + double_quotes
            + end
        )

        quotes_graph = pynutil.add_weight(quotes_graph, MIN_NEG_WEIGHT)
        quotes_graph = NEMO_SIGMA + pynini.closure(NEMO_SIGMA + quotes_graph + NEMO_SIGMA)
        graph = pynini.compose(graph, quotes_graph).optimize()

        # remove space between a word and a single quote followed by s
        remove_space_around_single_quote = pynini.cdrewrite(
            delete_space_optional + pynini.union(*self.punct_marks["'"]) + delete_space,
            NEMO_ALPHA,
            pynini.union("s ", "s[EOS]"),
            NEMO_SIGMA,
        )
        graph = pynini.compose(graph, remove_space_around_single_quote).optimize()
        return graph

View File

@@ -0,0 +1,68 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst
from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst
from pynini.lib import pynutil
class RomanFst(GraphFst):
    """
    Finite state transducer for verbalizing roman numerals
        e.g. tokens { roman { integer: "one" } } -> one

    Handles four serialized forms: a keyed cardinal (e.g. "chapter one"),
    a plain cardinal, a plain ordinal, and a keyed ordinal optionally
    prefixed with "the".

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="roman", kind="verbalize", deterministic=deterministic)
        # Word-final rewrite that converts a cardinal word into its ordinal form
        # (e.g. "four" -> "fourth"), reused from the ordinal verbalizer.
        suffix = OrdinalFst().suffix
        cardinal = pynini.closure(NEMO_NOT_QUOTE)
        ordinal = pynini.compose(cardinal, suffix)
        # key_cardinal: "<key>" integer: "<cardinal>" -> "<key> <cardinal>"
        graph = (
            pynutil.delete("key_cardinal: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
            + pynini.accep(" ")
            + pynutil.delete("integer: \"")
            + cardinal
            + pynutil.delete("\"")
        ).optimize()
        # default_cardinal: "default" integer: "<cardinal>" -> "<cardinal>"
        graph |= (
            pynutil.delete("default_cardinal: \"default\" integer: \"") + cardinal + pynutil.delete("\"")
        ).optimize()
        # default_ordinal: "default" integer: "<cardinal>" -> "<ordinal>"
        graph |= (
            pynutil.delete("default_ordinal: \"default\" integer: \"") + ordinal + pynutil.delete("\"")
        ).optimize()
        # key_the_ordinal: "<key>" integer: "<cardinal>" -> "<key> (the) <ordinal>"
        graph |= (
            pynutil.delete("key_the_ordinal: \"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
            + pynini.accep(" ")
            + pynutil.delete("integer: \"")
            + pynini.closure(pynutil.insert("the "), 0, 1)
            + ordinal
            + pynutil.delete("\"")
        ).optimize()
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

View File

@@ -0,0 +1,63 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone numbers, e.g.
        telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" }
        -> one, one two three, one two three, five six seven eight, one

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

        quoted_words = pynini.closure(NEMO_NOT_QUOTE, 1)

        # country_code: "<words>" -> "<words> " (separator space follows)
        country_code = (
            pynutil.delete("country_code: \"")
            + quoted_words
            + pynutil.delete("\"")
            + delete_space
            + insert_space
        )
        maybe_country_code = pynini.closure(country_code, 0, 1)

        # number_part: "<words>" -> "<words>"; a trailing space inside the quotes
        # is dropped (small negative weight so the deleting path is preferred).
        drop_trailing_space = pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1)
        number_part = (
            pynutil.delete("number_part: \"")
            + quoted_words
            + drop_trailing_space
            + pynutil.delete("\"")
        )

        # extension: "<words>" -> " <words>" (separator space precedes)
        extension = (
            delete_space
            + insert_space
            + pynutil.delete("extension: \"")
            + quoted_words
            + pynutil.delete("\"")
        )
        maybe_extension = pynini.closure(extension, 0, 1)

        graph = maybe_country_code + number_part + maybe_extension
        self.fst = self.delete_tokens(graph).optimize()

View File

@@ -0,0 +1,102 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
        time { hours: "twelve" minutes: "thirty" suffix: "a m" zone: "e s t" } -> twelve thirty a m e s t
        time { hours: "twelve" } -> twelve o'clock

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)
        # Each field graph strips the serialized wrapper, e.g. hours: "<words>" -> <words>.
        hour = (
            pynutil.delete("hours:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        minute = (
            pynutil.delete("minutes:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        suffix = (
            pynutil.delete("suffix:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
        zone = (
            pynutil.delete("zone:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
        second = (
            pynutil.delete("seconds:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        # Full hours/minutes/seconds reading, e.g. "two hours five minutes and one second".
        graph_hms = (
            hour
            + pynutil.insert(" hours ")
            + delete_space
            + minute
            + pynutil.insert(" minutes and ")
            + delete_space
            + second
            + pynutil.insert(" seconds")
            + optional_suffix
            + optional_zone
        )
        # Cleanup rewrite: drop a leading "o " (from "o five" style minutes) and
        # singularize the unit after "one" (word-boundary anchored on the left).
        graph_hms @= pynini.cdrewrite(
            pynutil.delete("o ")
            | pynini.cross("one minutes", "one minute")
            | pynini.cross("one seconds", "one second")
            | pynini.cross("one hours", "one hour"),
            pynini.union(" ", "[BOS]"),
            "",
            NEMO_SIGMA,
        )
        # hours + minutes (+ suffix) (+ zone), e.g. "twelve thirty a m e s t"
        graph = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
        # bare hour -> "<hour> o'clock" (+ zone)
        graph |= hour + insert_space + pynutil.insert("o'clock") + optional_zone
        # hour + suffix (+ zone), e.g. "twelve a m"
        graph |= hour + delete_space + insert_space + suffix + optional_zone
        graph |= graph_hms
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

Some files were not shown because too many files have changed in this diff Show More