File size: 5,485 Bytes
383520d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Prompt engineering for consistent voice generation."""
import re
import random
from typing import List, Dict, Optional
import logging

# Module-level logger, named after this module so output can be filtered per module.
logger = logging.getLogger(__name__)

# Descriptor table keyed by voice name. Every entry carries the same three fields:
#   "adjectives"      - list of single-word descriptors for the voice
#   "characteristics" - list of short phrases (pitch, pacing, tone, timbre/projection)
#   "speaking_style"  - one short phrase summarising the voice
# initialize_templates() returns this dict object itself, not a copy.
# NOTE(review): nothing else in this section reads the table (format_text_for_voice
# returns its input untouched) - confirm whether other modules still consume it.
VOICE_STYLES = {
    "alloy": {
        "adjectives": ["balanced", "natural", "clear", "articulate", "neutral", "conversational"],
        "characteristics": ["medium pitch", "even pacing", "neutral tone", "balanced resonance"],
        "speaking_style": "conversational and balanced"
    },
    "echo": {
        "adjectives": ["resonant", "deep", "reverberant", "rich", "sonorous", "full"],
        "characteristics": ["lower pitch", "deliberate pacing", "resonant tone", "deeper timbre"],
        "speaking_style": "rich and resonant"
    },
    "fable": {
        "adjectives": ["bright", "light", "clear", "energetic", "articulate", "animated"],
        "characteristics": ["higher pitch", "lively pacing", "bright tone", "clear articulation"],
        "speaking_style": "bright and energetic"
    },
    "onyx": {
        "adjectives": ["deep", "authoritative", "powerful", "commanding", "strong", "resolute"],
        "characteristics": ["low pitch", "measured pacing", "authoritative tone", "strong projection"],
        "speaking_style": "deep and authoritative"
    },
    "nova": {
        "adjectives": ["warm", "pleasant", "smooth", "harmonious", "gentle", "comforting"],
        "characteristics": ["medium pitch", "smooth pacing", "warm tone", "pleasant timbre"],
        "speaking_style": "warm and smooth"
    },
    "shimmer": {
        "adjectives": ["light", "airy", "bright", "crystalline", "delicate", "expressive"],
        "characteristics": ["higher pitch", "quick pacing", "light tone", "bright timbre"],
        "speaking_style": "light and expressive"
    },
    # Generic entry under the key "custom"; same shape as the named voices above.
    "custom": {
        "adjectives": ["clear", "distinct", "authentic", "natural", "personalized", "unique"],
        "characteristics": ["natural rhythm", "authentic tone", "personal inflection", "distinctive sound"],
        "speaking_style": "authentic and natural"
    }
}

def initialize_templates():
    """Log that templates are ready and hand back the voice style table.

    There is no real setup work yet; this is a hook for future initialization.
    Returns:
        The module-level VOICE_STYLES dict (the same object, not a copy)
    """
    styles = VOICE_STYLES
    logger.info("Prompt templates initialized")
    return styles

def split_into_segments(text: str, max_chars: int = 150) -> List[str]:
    """Split text into ordered segments of at most ``max_chars`` characters.

    Sentences are packed greedily into segments, joined by a single space.
    A sentence longer than ``max_chars`` is broken at phrase punctuation
    (``,`` ``;`` ``:``); a phrase that is still too long is broken at
    whitespace. A single word longer than ``max_chars`` is never cut: it is
    emitted unbroken as its own segment.
    Args:
        text: Text to split
        max_chars: Maximum characters per segment
    Returns:
        List of text segments in original reading order. Text that is empty
        or already fits is returned unchanged as a one-element list.
    """
    # Handle empty or very short text
    if not text or len(text) <= max_chars:
        return [text]

    segments: List[str] = []
    current = ""

    def flush() -> None:
        """Emit the segment being built (if any) and start a fresh one."""
        nonlocal current
        if current:
            segments.append(current)
        current = ""

    def add(piece: str) -> None:
        """Append piece to the current segment, flushing first if it won't fit."""
        nonlocal current
        if not current:
            # An oversized piece landing here is emitted alone on the next flush.
            current = piece
        elif len(current) + 1 + len(piece) <= max_chars:
            # The "+ 1" accounts for the joining space, so the limit is exact.
            current = f"{current} {piece}"
        else:
            flush()
            current = piece

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            # re.split yields an empty tail when the text ends in whitespace.
            continue

        if len(sentence) <= max_chars:
            add(sentence)
            continue

        # Sentence is too long on its own: close the pending segment, then
        # pack its phrases.
        flush()
        for phrase in re.split(r'(?<=[,;:])\s+', sentence):
            if len(phrase) <= max_chars:
                add(phrase)
                continue

            # Flush BEFORE chunking so earlier short phrases keep their
            # position ahead of the word chunks (preserves reading order).
            flush()
            for word in phrase.split():
                add(word)
            # Word chunks are not merged with whatever follows the phrase.
            flush()

    # Add the last segment
    flush()

    logger.info("Split text into %d segments", len(segments))
    return segments

def format_text_for_voice(text: str, voice_name: str, segment_index: int = 0, total_segments: int = 1) -> str:
    """Return the text unchanged; retained as the per-voice formatting hook.
    Args:
        text: Text to format
        voice_name: Name of the voice (currently unused)
        segment_index: Index of this segment for multi-segment texts (currently unused)
        total_segments: Total number of segments (currently unused)
    Returns:
        The input text, unmodified
    """
    # Deliberate pass-through. Bracketed voice instructions used to be added
    # here, but per the original author's note CSM reads them aloud, so voice
    # identity is controlled through speaker IDs instead and any further
    # formatting is left to the Generator class.
    return text