Spaces:
Paused
Paused
""" | |
Text normalization and cleaning utilities for CSM-1B TTS system. | |
Handles common issues like contractions, numbers, and special characters. | |
""" | |
import re | |
import logging | |
logger = logging.getLogger(__name__) | |
class TextNormalizer: | |
"""Text normalization utilities for TTS.""" | |
# Common English contractions mapping | |
CONTRACTIONS = { | |
"don't": "dont", | |
"won't": "wont", | |
"can't": "cant", | |
"isn't": "isnt", | |
"he's": "hes", | |
"she's": "shes", | |
"they're": "theyre", | |
"we're": "were", | |
"you're": "youre", | |
"that's": "thats", | |
"it's": "its", | |
"what's": "whats", | |
"let's": "lets", | |
"who's": "whos", | |
"how's": "hows", | |
"where's": "wheres", | |
"there's": "theres", | |
"wouldn't": "wouldnt", | |
"shouldn't": "shouldnt", | |
"couldn't": "couldnt", | |
"hasn't": "hasnt", | |
"haven't": "havent", | |
"hadn't": "hadnt", | |
"didn't": "didnt", | |
"i'm": "im", | |
"i've": "ive", | |
"i'd": "id", | |
"i'll": "ill", | |
"you've": "youve", | |
"you'll": "youll", | |
"you'd": "youd", | |
"we've": "weve", | |
"we'll": "well", | |
"we'd": "wed", | |
"they've": "theyve", | |
"they'll": "theyll", | |
"they'd": "theyd", | |
"aren't": "arent", | |
"weren't": "werent", | |
"wasn't": "wasnt", | |
} | |
# Common abbreviations to expand | |
ABBREVIATIONS = { | |
"Mr.": "Mister", | |
"Mrs.": "Misses", | |
"Dr.": "Doctor", | |
"Prof.": "Professor", | |
"St.": "Street", | |
"Rd.": "Road", | |
"Ave.": "Avenue", | |
"vs.": "versus", | |
"etc.": "etcetera", | |
"e.g.": "for example", | |
"i.e.": "that is", | |
"approx.": "approximately", | |
} | |
# Simple number words for common numbers | |
NUMBER_WORDS = { | |
"0": "zero", | |
"1": "one", | |
"2": "two", | |
"3": "three", | |
"4": "four", | |
"5": "five", | |
"6": "six", | |
"7": "seven", | |
"8": "eight", | |
"9": "nine", | |
"10": "ten", | |
"11": "eleven", | |
"12": "twelve", | |
"13": "thirteen", | |
"14": "fourteen", | |
"15": "fifteen", | |
"16": "sixteen", | |
"17": "seventeen", | |
"18": "eighteen", | |
"19": "nineteen", | |
"20": "twenty", | |
"30": "thirty", | |
"40": "forty", | |
"50": "fifty", | |
"60": "sixty", | |
"70": "seventy", | |
"80": "eighty", | |
"90": "ninety", | |
"100": "one hundred", | |
"1000": "one thousand", | |
"1000000": "one million", | |
"1000000000": "one billion", | |
} | |
def normalize_text(cls, text: str) -> str: | |
""" | |
Normalize text for TTS: handle contractions, punctuation, and special cases. | |
Args: | |
text: Input text to normalize | |
Returns: | |
Normalized text ready for TTS | |
""" | |
if not text: | |
return text | |
# Log original text for debugging | |
logger.debug(f"Normalizing text: '{text}'") | |
# Remove voice instructions in square brackets | |
text = re.sub(r'\[.*?\]', '', text) | |
# Handle contractions - preserving case sensitivity | |
for contraction, replacement in cls.CONTRACTIONS.items(): | |
# Case insensitive replacement | |
text = re.sub(r'\b' + re.escape(contraction) + r'\b', replacement, text, flags=re.IGNORECASE) | |
# Expand common abbreviations | |
for abbr, expanded in cls.ABBREVIATIONS.items(): | |
text = text.replace(abbr, expanded) | |
# Handle numbers - only convert standalone numbers | |
def replace_number(match): | |
number = match.group(0) | |
if number in cls.NUMBER_WORDS: | |
return cls.NUMBER_WORDS[number] | |
return number | |
text = re.sub(r'\b\d+\b', replace_number, text) | |
# Replace problematic symbols | |
text = text.replace("&", " and ") | |
text = text.replace("%", " percent ") | |
text = text.replace("@", " at ") | |
text = text.replace("#", " number ") | |
text = text.replace("$", " dollar ") | |
text = text.replace("€", " euro ") | |
text = text.replace("£", " pound ") | |
text = text.replace("¥", " yen ") | |
# Handle dates in MM/DD/YYYY format | |
text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', r'\1 \2 \3', text) | |
# Fix excessive spaces | |
text = re.sub(r'\s+', ' ', text).strip() | |
# Ensure sentence ends with punctuation | |
if not text[-1] in ['.', '!', '?', ';', ':', ',']: | |
text = text + '.' | |
logger.debug(f"Normalized text: '{text}'") | |
return text | |
def split_into_sentences(cls, text: str) -> list: | |
""" | |
Split text into sentences for better TTS performance. | |
Args: | |
text: Input text to split | |
Returns: | |
List of sentences | |
""" | |
# Normalize first | |
text = cls.normalize_text(text) | |
# Split on sentence boundaries | |
sentences = re.split(r'(?<=[.!?])\s+', text) | |
# Remove empty sentences | |
sentences = [s for s in sentences if s.strip()] | |
return sentences | |
def clean_text_for_tts(text: str) -> str: | |
"""Clean and normalize text for TTS processing.""" | |
return TextNormalizer.normalize_text(text) |