|
import re
|
|
|
|
import inflect
|
|
|
|
|
|
# Module-wide inflect engine; the ordinal and cardinal expansion helpers in
# this file call its number_to_words() method.
__INFLECT = inflect.engine()
|
|
# A digit run containing thousands separators, e.g. "1,234,567" (captured whole).
__COMMA_NUMBER_PATTERN = re.compile(r"([0-9][0-9\,]+[0-9])")

# A decimal number with digits on both sides of the point, e.g. "3.14".
__DECIMAL_NUMBER_PATTERN = re.compile(r"([0-9]+\.[0-9]+)")

# A pound-sterling amount; the group captures the digits after the sign.
# The pound sign is written as the escape \u00a3 (understood by the re module)
# so the pattern stays correct even if this file is re-encoded; the previous
# literal had been corrupted into a two-character mojibake sequence and
# therefore never matched a real pound sign.
__POUNDS_PATTERN = re.compile(r"\u00a3([0-9\,]*[0-9]+)")

# A dollar amount; the group captures digits, points and commas after "$".
__DOLLARS_PATTERN = re.compile(r"\$([0-9\.\,]*[0-9]+)")

# A digit run immediately followed by an ordinal suffix, e.g. "21st", "4th".
__ORDINAL_PATTERN = re.compile(r"[0-9]+(st|nd|rd|th)")

# Any remaining run of digits.
__NUMBER_PATTERN = re.compile(r"[0-9]+")
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Normalize English text in three passes and return the result.

    The passes run in this order:

    1. ``__normalize_numbers`` rewrites numeric expressions.
    2. ``replace_punctuation`` folds punctuation variants.
    3. A single space is inserted wherever one of ``, ; . ? !`` is directly
       followed by a word character.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    folded = replace_punctuation(__normalize_numbers(text))
    # Separate a punctuation mark from the word character glued to it.
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", folded)
|
|
|
|
|
|
# Punctuation variants and the ASCII text each one is replaced with.
#
# Keys are written as \uXXXX escapes so the table survives any re-encoding of
# this file: the previous literal keys had been corrupted by a wrong-charset
# decode and collapsed into visually identical duplicates.
# NOTE(review): the code points below were reconstructed from the surviving
# bytes of the corrupted literals (lead bytes plus the replacement values);
# please confirm them against the intended table, in particular the two dash
# entries (U+2014 and U+2212).
#
# Insertion order matters: the three-character ellipsis sequences must come
# before their single-character counterparts so the alternation built from
# this table prefers the longer match.
__PUNCTUATION_REPLACE_MAP = {
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "\uff0e": ".",  # fullwidth full stop
    "\u2026": "...",  # horizontal ellipsis
    "\u00b7\u00b7\u00b7": "...",  # three middle dots
    "\u30fb\u30fb\u30fb": "...",  # three katakana middle dots
    "\u00b7": ",",  # middle dot
    "\u30fb": ",",  # katakana middle dot
    "\u3001": ",",  # ideographic comma
    "$": ".",
    "\u201c": "'",  # left double quotation mark
    "\u201d": "'",  # right double quotation mark
    '"': "'",
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "(": "'",
    ")": "'",
    "\u300a": "'",  # left double angle bracket
    "\u300b": "'",  # right double angle bracket
    "\u3010": "'",  # left black lenticular bracket
    "\u3011": "'",  # right black lenticular bracket
    "[": "'",
    "]": "'",
    "\u2014": "-",  # em dash
    "\u2212": "-",  # minus sign
    "\uff5e": "-",  # fullwidth tilde
    "~": "-",
    "\u300c": "'",  # left corner bracket
    "\u300d": "'",  # right corner bracket
}

# One alternation over every key, compiled once instead of on every call.
__PUNCTUATION_PATTERN = re.compile(
    "|".join(re.escape(symbol) for symbol in __PUNCTUATION_REPLACE_MAP)
)


def replace_punctuation(text: str) -> str:
    """Replace punctuation variants in *text* with their ASCII substitutes.

    Every occurrence of a key of the module-level replacement table is
    swapped for its mapped value in a single left-to-right pass; all other
    characters are returned unchanged.

    Args:
        text: The text whose punctuation should be replaced.

    Returns:
        The text with every table key replaced by its mapped value.
    """
    return __PUNCTUATION_PATTERN.sub(
        lambda found: __PUNCTUATION_REPLACE_MAP[found.group()], text
    )
|
|
|
|
|
|
def __normalize_numbers(text: str) -> str:
    """Apply the module's number-rewriting substitutions to *text* in order.

    The order is significant: thousands separators are removed first, then
    pound and dollar amounts, decimals and ordinals are rewritten, and only
    then are the remaining digit runs expanded.

    Args:
        text: The text containing numeric expressions.

    Returns:
        The text after all substitutions have been applied.
    """
    rewrite_steps = (
        (__COMMA_NUMBER_PATTERN, __remove_commas),
        (__POUNDS_PATTERN, r"\1 pounds"),
        (__DOLLARS_PATTERN, __expand_dollars),
        (__DECIMAL_NUMBER_PATTERN, __expand_decimal_point),
        (__ORDINAL_PATTERN, __expand_ordinal),
        (__NUMBER_PATTERN, __expand_number),
    )
    rewritten = text
    for pattern, replacement in rewrite_steps:
        rewritten = pattern.sub(replacement, rewritten)
    return rewritten
|
|
|
|
|
|
def __expand_dollars(m: re.Match[str]) -> str:
    """Rewrite a matched dollar amount as "<n> dollar(s), <n> cent(s)".

    Group 1 of *m* is the amount without the leading "$". Commas are ignored
    when parsing, so a stray separator (for example "$,5") no longer makes
    ``int()`` raise ``ValueError``. A single fractional digit is read as
    tenths of a dollar, so "$1.5" yields 50 cents rather than 5; fractional
    parts of two or more digits are used as the cent count unchanged.

    Args:
        m: A match whose group 1 holds the digits, points and commas of the
            amount.

    Returns:
        The amount in words-ready form (numbers are still digits). Amounts
        with more than one decimal point are returned as the raw captured
        text followed by " dollars"; a zero amount gives "zero dollars".
    """
    match = m.group(1)
    parts = match.replace(",", "").split(".")
    if len(parts) > 2:
        # Unexpected format: leave the captured text as it was written.
        return match + " dollars"

    whole_text = parts[0]
    fraction_text = parts[1] if len(parts) > 1 else ""
    dollars = int(whole_text) if whole_text else 0
    # Right-pad one digit to two so ".5" means fifty cents, not five.
    cents = int(fraction_text.ljust(2, "0")) if fraction_text else 0

    spoken = []
    if dollars:
        spoken.append(f"{dollars} {'dollar' if dollars == 1 else 'dollars'}")
    if cents:
        spoken.append(f"{cents} {'cent' if cents == 1 else 'cents'}")
    return ", ".join(spoken) if spoken else "zero dollars"
|
|
|
|
|
|
def __remove_commas(m: re.Match[str]) -> str:
    """Return group 1 of *m* with every comma removed.

    Args:
        m: A match whose group 1 is a number containing comma separators.

    Returns:
        The captured text without commas.
    """
    digit_chunks = m.group(1).split(",")
    return "".join(digit_chunks)
|
|
|
|
|
|
def __expand_ordinal(m: re.Match[str]) -> str:
    """Convert a matched ordinal token to words with the inflect engine.

    Args:
        m: A match whose full text is the ordinal token (digits plus suffix).

    Returns:
        Whatever ``number_to_words`` produces for the whole matched text.
    """
    ordinal_token = m.group(0)
    return __INFLECT.number_to_words(ordinal_token)
|
|
|
|
|
|
def __expand_number(m: re.Match[str]) -> str:
    """Convert a matched digit run to words, treating 1001-2999 specially.

    Values strictly between 1000 and 3000 are handled case by case:

    * 2000 becomes "two thousand".
    * 2001-2009 become "two thousand " plus the words for the last digit.
    * Exact multiples of 100 become "<words for value // 100> hundred".
    * Anything else is passed to inflect with ``group=2`` and ``zero="oh"``,
      and the ", " separators in the result are replaced by spaces.

    Every other value is passed to inflect with ``andword=""``.

    Args:
        m: A match whose full text is a run of decimal digits.

    Returns:
        The number rendered as words.
    """
    value = int(m.group(0))

    # Outside the special range: plain cardinal reading.
    if not 1000 < value < 3000:
        return __INFLECT.number_to_words(value, andword="")

    if value == 2000:
        return "two thousand"
    if 2000 < value < 2010:
        return "two thousand " + __INFLECT.number_to_words(value % 100)

    hundreds, remainder = divmod(value, 100)
    if remainder == 0:
        return __INFLECT.number_to_words(hundreds) + " hundred"

    grouped = __INFLECT.number_to_words(value, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
|
|
|
|
|
|
def __expand_decimal_point(m: re.Match[str]) -> str:
    """Return group 1 of *m* with each "." replaced by " point ".

    Args:
        m: A match whose group 1 is a decimal number such as "3.14".

    Returns:
        The captured text with every point spelled out, e.g. "3 point 14".
    """
    pieces = m.group(1).split(".")
    return " point ".join(pieces)
|
|
|