204 lines
6.2 KiB
Python
204 lines
6.2 KiB
Python
from parsivar import Normalizer
|
||
|
||
import num2fawords
|
||
import re
|
||
import string
|
||
|
||
|
||
# Module-wide Parsivar normalizer instance, built once at import time.
# It is configured to use ZWNJ (U+200C) as its half-space character and has
# statistical space correction switched on.
_normalizer = Normalizer(
    half_space_char="\u200c",
    statistical_space_correction=True,
)
|
||
# Symbols that are deleted from a sentence by `remove_special_characters`.
# Duplicated entries are harmless inside a regex character class and are kept
# exactly as they were.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "<EFBFBD>",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "<EFBFBD>", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
    # These five were never listed, but the old unescaped class contained the
    # accidental range `!-;` (from "!", "-", ";"), which matched them as well.
    # They are spelled out so the set of removed characters stays the same
    # now that every entry is escaped.
    "$", "&", "*", "+", "/",
]
# ASCII lowercase letters and ASCII digits are removed too.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape each entry so that "-" (or a future "]", "^", "\\") is always taken
# literally instead of being interpreted as character-class syntax.
chars_to_ignore = "[" + "".join(re.escape(symbol) for symbol in chars_to_ignore) + "]"

# Zero-width non-joiner (U+200C).
zwnj = "\u200c"
# Characters after which the «آ» handling in `normalizer` does NOT insert a
# ZWNJ: the listed letters, the ZWNJ itself and the plain space.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ", zwnj, " "]
|
||
|
||
|
||
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in one pass.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict of ``{substring: replacement}``. Keys are
            matched literally (they are regex-escaped). Alternatives are
            tried in the dict's key order, so when one key is a prefix of
            another, the one listed first wins at a given position.

    Returns:
        The converted string. With an empty mapping the text is returned
        unchanged.
    """
    text = str(text)
    if not chars_to_mapping:
        # Joining zero keys gives the empty pattern, which matches at every
        # position and made the lookup below fail with KeyError('').
        return text

    pattern = "|".join(map(re.escape, chars_to_mapping))
    return re.sub(pattern, lambda match: chars_to_mapping[match.group()], text)
|
||
|
||
|
||
def remove_special_characters(text, chars_to_ignore_regex):
    """Strip unwanted characters, lowercase, and pad with one space.

    Every match of ``chars_to_ignore_regex`` is deleted from ``text``; the
    remainder is lowercased and a single trailing space is appended.
    """
    cleaned = re.sub(chars_to_ignore_regex, "", text)
    return f"{cleaned.lower()} "
|
||
|
||
|
||
def convert_word_nums_to_text(word):
    """Spell out an integer token in words via ``num2fawords``.

    Args:
        word: A single token.

    Returns:
        ``num2fawords.words(int(word))`` when the token parses as an integer
        and the conversion succeeds; otherwise the token exactly as it was
        passed in.
    """
    try:
        number = int(word)
    except (TypeError, ValueError, OverflowError):
        # Not an integer literal: leave the token alone.
        return word

    try:
        return num2fawords.words(number)
    except Exception:
        # Best effort, as before, but without a bare `except` (which also
        # swallowed KeyboardInterrupt/SystemExit). The untouched token is
        # returned; previously the parsed int leaked out here and broke the
        # string join done by the caller.
        return word
|
||
|
||
|
||
def normalizer_at_word_level(text):
    """Normalize a sentence token by token.

    The text is split on whitespace. Each token first goes through
    ``convert_word_nums_to_text`` and is then looked up in
    ``fixator_dictionary`` (tokens without an entry are kept as they are).
    The tokens are re-joined with single spaces and one trailing space is
    appended.
    """
    converted_tokens = map(convert_word_nums_to_text, text.split())
    fixed_tokens = (fixator_dictionary.get(token, token) for token in converted_tokens)
    return " ".join(fixed_tokens) + " "
|
||
|
||
|
||
def finder(ss, s, starter=False):
    """Locate all non-overlapping matches of the pattern ``ss`` in ``s``.

    ``ss`` is handed to ``re.finditer`` as a regular expression.

    Returns:
        A list of start indices when ``starter`` is true, otherwise a list
        of ``(start, end)`` tuples, in the order the matches occur.
    """
    matches = re.finditer(ss, s)
    if starter:
        return [match.start() for match in matches]
    return [(match.start(), match.end()) for match in matches]
|
||
|
||
|
||
def substring_replace(ss, s, start, end, stripped=True):
    """Replace ``s[start:end]`` with ``ss``.

    Args:
        ss: Replacement text.
        s: Source string.
        start: Index where the replaced span begins.
        end: Index just past the replaced span.
        stripped: When true, trailing whitespace of the part before
            ``start`` is removed before ``ss`` is attached.

    Returns:
        A ``(new_string, removed)`` tuple. ``removed`` is the number of
        characters trimmed from the prefix (0 when ``stripped`` is false),
        which callers use to keep their running offsets in sync.
    """
    prefix, suffix = s[:start], s[end:]

    removed = 0
    if stripped:
        trimmed = prefix.rstrip()
        # Report what rstrip() really removed. The old 0/1 flag was wrong for
        # several trailing spaces and for non-space whitespace such as a tab
        # or a no-break space, so callers' offsets drifted.
        removed = len(prefix) - len(trimmed)
        prefix = trimmed

    return prefix + ss + suffix, removed
|
||
|
||
|
||
# Suffixes glued to the preceding word with a ZWNJ. Order matters: each entry
# is processed over the whole text before the next one, longer forms first.
# (Disabled in the original and still disabled: "ام", "ای", "است", "ایم", "اید", "اند".)
_HA_SUFFIXES = (
    "هایمان", "هایم", "هایت", "هایش",
    "هایتان", "هایشان", "هام", "هات",
    "هاتان", "هامون", "هامان", "هاش",
    "هاتون", "هاشان", "هاشون",
    "هایی", "های", "هاس", "ها",
)
_TAR_SUFFIXES = ("ترین", "تر")


def _separate_before_alef_madda(text):
    """Insert a ZWNJ after the character that precedes each «آ».

    Nothing is inserted when «آ» is the first character or when the preceding
    character is listed in ``silent_chars``.
    """
    offset = 0  # net length change of `text` caused by earlier replacements
    for found in finder("آ", text, True):
        index = found + offset - 1  # position of the character before «آ»
        if index < 0 or index >= len(text):
            # «آ» at the very start has no predecessor. The old code read
            # text[-1] here, which is always the trailing space added by
            # remove_special_characters and therefore also did nothing.
            continue
        if text[index] in silent_chars:
            continue
        text, removed = substring_replace(
            f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
        offset += 1 - removed
    return text


def _attach_suffix(text, suffix, require_boundary=True):
    """Prefix occurrences of ``suffix`` with a ZWNJ, dropping a space before it.

    With ``require_boundary`` only occurrences that end the text or are
    followed by a space are rewritten; otherwise every occurrence is.

    NOTE(review): an occurrence that is already preceded by a ZWNJ receives a
    second one, because only whitespace is trimmed from the prefix. Confirm
    whether that is intended before changing it.
    """
    offset = 0  # net length change of `text` caused by earlier replacements
    for match_start, match_end in finder(suffix, text, False):
        start, end = match_start + offset, match_end + offset
        if end > len(text):
            # Offset bookkeeping went out of range; skip, as the original did.
            continue
        if require_boundary and end < len(text) and text[end] != " ":
            continue
        text, removed = substring_replace(
            f"{zwnj}{suffix}", text, start, end, stripped=True)
        offset += 1 - removed
    return text


def normalizer(
        batch,
        is_normalize=True,
        return_dict=True,
        filter_trivials=False,
        remove_extra_space=False
):
    """Normalize ``batch["sentence"]``.

    Steps, in order: lowercase and strip; optional Parsivar normalization;
    ``dictionary_mapping`` replacements; removal of ``chars_to_ignore``;
    ZWNJ insertion around «آ» and before the ها / افزار / ترین / تر suffixes;
    word-level normalization; whitespace clean-up.

    Args:
        batch: Mapping with a ``"sentence"`` string. It is updated in place
            when ``return_dict`` is true.
        is_normalize: Run the module-level Parsivar normalizer first.
        return_dict: Return ``batch`` (true) or only the text (false).
        filter_trivials: Replace a result of length 2 or less with ``None``.
            The length includes the trailing space kept when
            ``remove_extra_space`` is false.
        remove_extra_space: Strip the result completely; otherwise exactly
            one trailing space is kept.

    Returns:
        ``batch`` with ``batch["sentence"]`` replaced, or the normalized text
        (possibly ``None``) when ``return_dict`` is false.
    """
    text = batch["sentence"].lower().strip()

    # Parsivar normalizer
    if is_normalize:
        text = _normalizer.normalize(text)

    # Dictionary mapping
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove specials (this also leaves a single trailing space)
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Replace connected آ
    text = _separate_before_alef_madda(text)

    # Replace connected ها
    for suffix in _HA_SUFFIXES:
        text = _attach_suffix(text, suffix)

    # Replace connected افزار (no boundary check: rewritten wherever it occurs)
    text = _attach_suffix(text, "افزار", require_boundary=False)

    # Replace connected ترین / تر
    for suffix in _TAR_SUFFIXES:
        text = _attach_suffix(text, suffix)

    # Normalizer at word level
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    text = text.strip()
    if not remove_extra_space:
        text += " "

    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
|
||
|