Spaces:

buchi-stdesign
/

SBV2-Editor-demo2

Running

App Files Files Community

SBV2-Editor-demo2 / style_bert_vits2 /nlp /english /normalizer.py

buchi-stdesign

Upload 221 files

08e6e45 verified 7 months ago

raw

history blame contribute delete

4.09 kB

	import re

	import inflect


	# Shared inflect engine; __expand_ordinal and __expand_number call its
	# number_to_words() to turn matched digits into English words.
	__INFLECT = inflect.engine()
	# Runs of three or more digits/commas that start and end with a digit,
	# e.g. "1,234,567"; group 1 is the whole run.
	__COMMA_NUMBER_PATTERN = re.compile(r"([0-9][0-9\,]+[0-9])")
	# Plain decimals such as "3.14"; group 1 is the whole number.
	__DECIMAL_NUMBER_PATTERN = re.compile(r"([0-9]+\.[0-9]+)")
	# Pound amounts; group 1 is the amount without the "£" sign.
	__POUNDS_PATTERN = re.compile(r"£([0-9\,]*[0-9]+)")
	# Dollar amounts; group 1 is the amount without the "$" sign.
	__DOLLARS_PATTERN = re.compile(r"\$([0-9\.\,]*[0-9]+)")
	# Ordinals such as "1st", "22nd", "3rd", "4th". The suffixes must be joined
	# with a bare "|": an escaped "\|" is a literal pipe character, which made
	# the pattern unable to match any real ordinal.
	__ORDINAL_PATTERN = re.compile(r"[0-9]+(st|nd|rd|th)")
	# Any run of digits.
	__NUMBER_PATTERN = re.compile(r"[0-9]+")


	def normalize_text(text: str) -> str:
	    """Normalize a piece of text in three passes.

	    Numeric expressions are rewritten by ``__normalize_numbers``, punctuation
	    is mapped by ``replace_punctuation``, and finally a space is inserted
	    between any of ``, ; . ? !`` and a word character directly following it.

	    Args:
	        text: Raw input text.

	    Returns:
	        The normalized text.
	    """
	    numbers_done = __normalize_numbers(text)
	    punctuation_done = replace_punctuation(numbers_done)
	    # Separate punctuation that is glued to the next word ("a,b" -> "a, b").
	    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", punctuation_done)


	# Maps full-width, CJK and typographic punctuation (plus a few ASCII marks
	# and the newline) onto the small set of symbols kept in the output.
	__REPLACE_MAP = {
	    "：": ",",
	    "；": ",",
	    "，": ",",
	    "。": ".",
	    "！": "!",
	    "？": "?",
	    "\n": ".",
	    "．": ".",
	    "…": "...",
	    "···": "...",
	    "・・・": "...",
	    "·": ",",
	    "・": ",",
	    "、": ",",
	    "$": ".",
	    "“": "'",
	    "”": "'",
	    '"': "'",
	    "‘": "'",
	    "’": "'",
	    "（": "'",
	    "）": "'",
	    "(": "'",
	    ")": "'",
	    "《": "'",
	    "》": "'",
	    "【": "'",
	    "】": "'",
	    "[": "'",
	    "]": "'",
	    "—": "-",
	    "−": "-",
	    "～": "-",
	    "~": "-",
	    "「": "'",
	    "」": "'",
	}
	# One alternation over all keys, built once at import time. The separator
	# must be a bare "|" (an escaped "\|" would be a literal pipe and nothing
	# would ever match). Longer keys go first so that "···" is tried before "·".
	__REPLACE_PATTERN = re.compile(
	    "|".join(
	        re.escape(key) for key in sorted(__REPLACE_MAP, key=len, reverse=True)
	    )
	)


	def replace_punctuation(text: str) -> str:
	    """Replace every mapped punctuation sequence in ``text``.

	    Args:
	        text: Text that may contain full-width, CJK or typographic
	            punctuation, brackets, quotes, dashes or newlines.

	    Returns:
	        ``text`` with each occurrence of a key of the replacement map
	        substituted by its mapped value; all other characters are kept.
	    """
	    return __REPLACE_PATTERN.sub(lambda hit: __REPLACE_MAP[hit.group()], text)


	def __normalize_numbers(text: str) -> str:
	    """Apply every number-related substitution to ``text`` in a fixed order.

	    Commas inside numbers are removed first so the later patterns see plain
	    digit runs; the generic number pass runs last and therefore also handles
	    the digits that the pound and dollar passes leave in place.

	    Args:
	        text: Input text.

	    Returns:
	        The text after all substitutions.
	    """
	    stages = (
	        (__COMMA_NUMBER_PATTERN, __remove_commas),
	        (__POUNDS_PATTERN, r"\1 pounds"),
	        (__DOLLARS_PATTERN, __expand_dollars),
	        (__DECIMAL_NUMBER_PATTERN, __expand_decimal_point),
	        (__ORDINAL_PATTERN, __expand_ordinal),
	        (__NUMBER_PATTERN, __expand_number),
	    )
	    for pattern, replacement in stages:
	        text = pattern.sub(replacement, text)
	    return text


	def __expand_dollars(m: re.Match[str]) -> str:
	    """Attach "dollar(s)" / "cent(s)" unit words to a matched dollar amount.

	    Args:
	        m: Match whose group 1 is the amount without the leading "$"
	            (digits, optionally containing "." and ",").

	    Returns:
	        The amount with unit words; the numbers themselves stay as digits.
	        An amount with more than one "." is returned as captured with
	        " dollars" appended, and a zero amount yields "zero dollars".
	    """
	    amount = m.group(1)
	    # Commas carry no value, and a leftover one (e.g. "$,5" or "$1.,5")
	    # would make int() raise ValueError, so drop them before parsing.
	    pieces = amount.replace(",", "").split(".")
	    if len(pieces) > 2:
	        return amount + " dollars"  # Unexpected format
	    dollars = int(pieces[0]) if pieces[0] else 0
	    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0
	    spoken = []
	    if dollars:
	        spoken.append(f"{dollars} {'dollar' if dollars == 1 else 'dollars'}")
	    if cents:
	        spoken.append(f"{cents} {'cent' if cents == 1 else 'cents'}")
	    return ", ".join(spoken) if spoken else "zero dollars"


	def __remove_commas(m: re.Match[str]) -> str:
	    """Return capture group 1 of ``m`` with every comma dropped."""
	    digit_groups = m.group(1).split(",")
	    return "".join(digit_groups)


	def __expand_ordinal(m: re.Match[str]) -> str:
	    """Pass the whole match (digits plus suffix) to inflect's number_to_words."""
	    ordinal_text = m.group(0)
	    return __INFLECT.number_to_words(ordinal_text)  # type: ignore


	def __expand_number(m: re.Match[str]) -> str:
	    """Spell out the matched digit run with the shared inflect engine.

	    Values strictly between 1000 and 3000 get special readings (presumably
	    so that years read naturally): 2000 is "two thousand", 2001-2009 are
	    "two thousand" plus the last two digits, exact hundreds are
	    "<n> hundred", and anything else is converted with inflect's
	    ``group=2`` / ``zero="oh"`` options with the ", " separators replaced
	    by spaces. Every other value is converted with ``andword=""``.

	    Args:
	        m: Match whose full text is a run of digits.

	    Returns:
	        The number as words.
	    """
	    value = int(m.group(0))
	    if not 1000 < value < 3000:
	        return __INFLECT.number_to_words(value, andword="")  # type: ignore
	    if value == 2000:
	        return "two thousand"
	    if 2000 < value < 2010:
	        return "two thousand " + __INFLECT.number_to_words(value % 100)  # type: ignore
	    if value % 100 == 0:
	        return __INFLECT.number_to_words(value // 100) + " hundred"  # type: ignore
	    paired = __INFLECT.number_to_words(  # type: ignore
	        value, andword="", zero="oh", group=2
	    )
	    return paired.replace(", ", " ")  # type: ignore


	def __expand_decimal_point(m: re.Match[str]) -> str:
	    """Return capture group 1 of ``m`` with each '.' replaced by ' point '."""
	    segments = m.group(1).split(".")
	    return " point ".join(segments)