Spaces:
Paused
Paused
File size: 5,574 Bytes
01115c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
"""
Text normalization and cleaning utilities for CSM-1B TTS system.
Handles common issues like contractions, numbers, and special characters.
"""
import re
import logging
logger = logging.getLogger(__name__)
class TextNormalizer:
    """Text normalization utilities for TTS.

    All behaviour is exposed through classmethods, so the class is used as a
    namespace and never needs to be instantiated.  The lookup tables below
    are read through ``cls`` at call time, so a subclass may override them.
    """

    # Common English contractions mapping (apostrophe-free spellings).
    # Keys are lowercase; matching is case-insensitive.
    CONTRACTIONS = {
        "don't": "dont",
        "won't": "wont",
        "can't": "cant",
        "isn't": "isnt",
        "he's": "hes",
        "she's": "shes",
        "they're": "theyre",
        "we're": "were",
        "you're": "youre",
        "that's": "thats",
        "it's": "its",
        "what's": "whats",
        "let's": "lets",
        "who's": "whos",
        "how's": "hows",
        "where's": "wheres",
        "there's": "theres",
        "wouldn't": "wouldnt",
        "shouldn't": "shouldnt",
        "couldn't": "couldnt",
        "hasn't": "hasnt",
        "haven't": "havent",
        "hadn't": "hadnt",
        "didn't": "didnt",
        "i'm": "im",
        "i've": "ive",
        "i'd": "id",
        "i'll": "ill",
        "you've": "youve",
        "you'll": "youll",
        "you'd": "youd",
        "we've": "weve",
        "we'll": "well",
        "we'd": "wed",
        "they've": "theyve",
        "they'll": "theyll",
        "they'd": "theyd",
        "aren't": "arent",
        "weren't": "werent",
        "wasn't": "wasnt",
    }

    # Common abbreviations to expand (matched case-sensitively).
    ABBREVIATIONS = {
        "Mr.": "Mister",
        "Mrs.": "Misses",
        "Dr.": "Doctor",
        "Prof.": "Professor",
        "St.": "Street",
        "Rd.": "Road",
        "Ave.": "Avenue",
        "vs.": "versus",
        "etc.": "etcetera",
        "e.g.": "for example",
        "i.e.": "that is",
        "approx.": "approximately",
    }

    # Simple number words for common numbers; any other number is left as digits.
    NUMBER_WORDS = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
        "10": "ten",
        "11": "eleven",
        "12": "twelve",
        "13": "thirteen",
        "14": "fourteen",
        "15": "fifteen",
        "16": "sixteen",
        "17": "seventeen",
        "18": "eighteen",
        "19": "nineteen",
        "20": "twenty",
        "30": "thirty",
        "40": "forty",
        "50": "fifty",
        "60": "sixty",
        "70": "seventy",
        "80": "eighty",
        "90": "ninety",
        "100": "one hundred",
        "1000": "one thousand",
        "1000000": "one million",
        "1000000000": "one billion",
    }

    # Symbols replaced by a spoken word.  The padding spaces are collapsed
    # later by the whitespace clean-up step.
    _SYMBOL_TABLE = str.maketrans({
        "&": " and ",
        "%": " percent ",
        "@": " at ",
        "#": " number ",
        "$": " dollar ",
        "€": " euro ",
        "£": " pound ",
        "¥": " yen ",
    })

    # Characters that already count as closing punctuation for a piece of text.
    _TERMINATORS = ('.', '!', '?', ';', ':', ',')

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Re-case ``replacement`` to follow ``original`` (ALL CAPS or Capitalized)."""
        if original.isupper():
            return replacement.upper()
        if original[:1].isupper():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    @classmethod
    def normalize_text(cls, text: str) -> str:
        """
        Normalize text for TTS: handle contractions, punctuation, and special cases.

        Steps, in order: strip ``[...]`` voice instructions, rewrite
        contractions, expand abbreviations, space out MM/DD/YYYY dates, spell
        out numbers found in ``NUMBER_WORDS``, replace symbols with words,
        collapse whitespace, and append a final ``.`` when the text does not
        already end in punctuation.

        Args:
            text: Input text to normalize

        Returns:
            Normalized text ready for TTS.  Falsy input is returned unchanged,
            and an empty string is returned when nothing is left after
            cleaning (for example, input that was only a bracketed instruction).
        """
        if not text:
            return text

        # Log original text for debugging
        logger.debug("Normalizing text: '%s'", text)

        # Remove voice instructions in square brackets
        text = re.sub(r'\[.*?\]', '', text)

        # Handle contractions: match case-insensitively, but keep the
        # capitalization the writer used ("Don't" -> "Dont", not "dont").
        for contraction, replacement in cls.CONTRACTIONS.items():
            text = re.sub(
                r'\b' + re.escape(contraction) + r'\b',
                lambda m, rep=replacement: cls._match_case(m.group(0), rep),
                text,
                flags=re.IGNORECASE,
            )

        # Expand common abbreviations.  The look-behind stops an abbreviation
        # from matching the tail of a longer word ("revs." must not become
        # "reversus").  A callable replacement keeps the expansion literal.
        for abbr, expanded in cls.ABBREVIATIONS.items():
            text = re.sub(
                r'(?<!\w)' + re.escape(abbr),
                lambda _m, rep=expanded: rep,
                text,
            )

        # Handle dates in MM/DD/YYYY format.  This must run before the number
        # pass: once "12" has become "twelve" the date pattern can no longer
        # match.
        text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', r'\1 \2 \3', text)

        # Handle numbers - only standalone numbers with a known spelling
        text = re.sub(
            r'\b\d+\b',
            lambda m: cls.NUMBER_WORDS.get(m.group(0), m.group(0)),
            text,
        )

        # Replace problematic symbols
        text = text.translate(cls._SYMBOL_TABLE)

        # Fix excessive spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Nothing left to speak (e.g. input was only "[laughs]"); indexing the
        # last character here would raise IndexError.
        if not text:
            return text

        # Ensure sentence ends with punctuation
        if not text.endswith(cls._TERMINATORS):
            text += '.'

        logger.debug("Normalized text: '%s'", text)
        return text

    @classmethod
    def split_into_sentences(cls, text: str) -> list:
        """
        Split text into sentences for better TTS performance.

        The text is normalized with ``normalize_text`` first and then split on
        whitespace that follows ``.``, ``!`` or ``?``.

        Args:
            text: Input text to split

        Returns:
            List of non-empty sentences; an empty list when there is no text
            to split.
        """
        # Normalize first
        text = cls.normalize_text(text)
        if not text:
            return []

        # Split on sentence boundaries and drop empty fragments
        return [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
def clean_text_for_tts(text: str) -> str:
    """Clean and normalize text for TTS processing.

    Module-level convenience wrapper that delegates to
    ``TextNormalizer.normalize_text``.

    Args:
        text: Input text to normalize

    Returns:
        Whatever ``TextNormalizer.normalize_text`` returns for ``text``.
    """
    return TextNormalizer.normalize_text(text)