""" Text normalization and cleaning utilities for CSM-1B TTS system. Handles common issues like contractions, numbers, and special characters. """ import re import logging logger = logging.getLogger(__name__) class TextNormalizer: """Text normalization utilities for TTS.""" # Common English contractions mapping CONTRACTIONS = { "don't": "dont", "won't": "wont", "can't": "cant", "isn't": "isnt", "he's": "hes", "she's": "shes", "they're": "theyre", "we're": "were", "you're": "youre", "that's": "thats", "it's": "its", "what's": "whats", "let's": "lets", "who's": "whos", "how's": "hows", "where's": "wheres", "there's": "theres", "wouldn't": "wouldnt", "shouldn't": "shouldnt", "couldn't": "couldnt", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "didn't": "didnt", "i'm": "im", "i've": "ive", "i'd": "id", "i'll": "ill", "you've": "youve", "you'll": "youll", "you'd": "youd", "we've": "weve", "we'll": "well", "we'd": "wed", "they've": "theyve", "they'll": "theyll", "they'd": "theyd", "aren't": "arent", "weren't": "werent", "wasn't": "wasnt", } # Common abbreviations to expand ABBREVIATIONS = { "Mr.": "Mister", "Mrs.": "Misses", "Dr.": "Doctor", "Prof.": "Professor", "St.": "Street", "Rd.": "Road", "Ave.": "Avenue", "vs.": "versus", "etc.": "etcetera", "e.g.": "for example", "i.e.": "that is", "approx.": "approximately", } # Simple number words for common numbers NUMBER_WORDS = { "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine", "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen", "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen", "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty", "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy", "80": "eighty", "90": "ninety", "100": "one hundred", "1000": "one thousand", "1000000": "one million", "1000000000": "one billion", } @classmethod def normalize_text(cls, text: str) -> str: """ Normalize text for TTS: handle contractions, punctuation, and special cases. Args: text: Input text to normalize Returns: Normalized text ready for TTS """ if not text: return text # Log original text for debugging logger.debug(f"Normalizing text: '{text}'") # Remove voice instructions in square brackets text = re.sub(r'\[.*?\]', '', text) # Handle contractions - preserving case sensitivity for contraction, replacement in cls.CONTRACTIONS.items(): # Case insensitive replacement text = re.sub(r'\b' + re.escape(contraction) + r'\b', replacement, text, flags=re.IGNORECASE) # Expand common abbreviations for abbr, expanded in cls.ABBREVIATIONS.items(): text = text.replace(abbr, expanded) # Handle numbers - only convert standalone numbers def replace_number(match): number = match.group(0) if number in cls.NUMBER_WORDS: return cls.NUMBER_WORDS[number] return number text = re.sub(r'\b\d+\b', replace_number, text) # Replace problematic symbols text = text.replace("&", " and ") text = text.replace("%", " percent ") text = text.replace("@", " at ") text = text.replace("#", " number ") text = text.replace("$", " dollar ") text = text.replace("€", " euro ") text = text.replace("£", " pound ") text = text.replace("¥", " yen ") # Handle dates in MM/DD/YYYY format text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', r'\1 \2 \3', text) # Fix excessive spaces text = re.sub(r'\s+', ' ', text).strip() # Ensure sentence ends with punctuation if not text[-1] in ['.', '!', '?', ';', ':', ',']: text = text + '.' logger.debug(f"Normalized text: '{text}'") return text @classmethod def split_into_sentences(cls, text: str) -> list: """ Split text into sentences for better TTS performance. Args: text: Input text to split Returns: List of sentences """ # Normalize first text = cls.normalize_text(text) # Split on sentence boundaries sentences = re.split(r'(?<=[.!?])\s+', text) # Remove empty sentences sentences = [s for s in sentences if s.strip()] return sentences def clean_text_for_tts(text: str) -> str: """Clean and normalize text for TTS processing.""" return TextNormalizer.normalize_text(text)