Spaces:

jameszokah
/

jamiya

Running

File size: 5,485 Bytes

383520d

"""Prompt engineering for consistent voice generation."""
import re
import random
from typing import List, Dict, Optional
import logging

# Module-level logger, named after this module via __name__, used by the
# functions below for informational messages.
logger = logging.getLogger(__name__)

# Descriptor rows, one per voice:
#   (voice name, adjectives, characteristics, speaking style)
# Row order determines the key order of VOICE_STYLES.
_VOICE_STYLE_ROWS = (
    (
        "alloy",
        ["balanced", "natural", "clear", "articulate", "neutral", "conversational"],
        ["medium pitch", "even pacing", "neutral tone", "balanced resonance"],
        "conversational and balanced",
    ),
    (
        "echo",
        ["resonant", "deep", "reverberant", "rich", "sonorous", "full"],
        ["lower pitch", "deliberate pacing", "resonant tone", "deeper timbre"],
        "rich and resonant",
    ),
    (
        "fable",
        ["bright", "light", "clear", "energetic", "articulate", "animated"],
        ["higher pitch", "lively pacing", "bright tone", "clear articulation"],
        "bright and energetic",
    ),
    (
        "onyx",
        ["deep", "authoritative", "powerful", "commanding", "strong", "resolute"],
        ["low pitch", "measured pacing", "authoritative tone", "strong projection"],
        "deep and authoritative",
    ),
    (
        "nova",
        ["warm", "pleasant", "smooth", "harmonious", "gentle", "comforting"],
        ["medium pitch", "smooth pacing", "warm tone", "pleasant timbre"],
        "warm and smooth",
    ),
    (
        "shimmer",
        ["light", "airy", "bright", "crystalline", "delicate", "expressive"],
        ["higher pitch", "quick pacing", "light tone", "bright timbre"],
        "light and expressive",
    ),
    (
        "custom",
        ["clear", "distinct", "authentic", "natural", "personalized", "unique"],
        ["natural rhythm", "authentic tone", "personal inflection", "distinctive sound"],
        "authentic and natural",
    ),
)

# Voice style descriptors keyed by voice name. Every entry carries the same
# three fields: "adjectives", "characteristics" and "speaking_style".
VOICE_STYLES = {
    voice: {
        "adjectives": adjectives,
        "characteristics": characteristics,
        "speaking_style": speaking_style,
    }
    for voice, adjectives, characteristics, speaking_style in _VOICE_STYLE_ROWS
}

def initialize_templates():
    """Log that templates are ready and hand back the voice style table.

    No setup work is performed; the function only emits an info-level log
    record and returns the module-level ``VOICE_STYLES`` mapping itself
    (not a copy).

    Returns:
        The ``VOICE_STYLES`` dictionary.
    """
    templates = VOICE_STYLES
    logger.info("Prompt templates initialized")
    return templates

def _pack_words(phrase: str, max_chars: int) -> List[str]:
    """Greedily pack the whitespace-separated words of a phrase into chunks of at most max_chars."""
    chunks: List[str] = []
    chunk = ""
    for word in phrase.split():
        if not chunk:
            # A chunk always takes its first word, even an over-long one;
            # this avoids emitting an empty chunk ahead of it.
            chunk = word
        elif len(chunk) + 1 + len(word) <= max_chars:
            chunk = f"{chunk} {word}"
        else:
            chunks.append(chunk)
            chunk = word
    if chunk:
        chunks.append(chunk)
    return chunks


def split_into_segments(text: str, max_chars: int = 150) -> List[str]:
    """Split text into optimal segments for better generation.

    The text is split into sentences (after ``.``, ``!`` or ``?``). Sentences
    are packed, in order, into segments no longer than ``max_chars``. A
    sentence longer than ``max_chars`` is split into phrases (after ``,``,
    ``;`` or ``:``), and a phrase that is still too long is split on
    whitespace into word chunks. Pieces sharing a segment are joined with a
    single space, and that space is counted against ``max_chars``.

    A single word longer than ``max_chars`` is never cut; it is emitted as
    its own (over-long) segment.

    Args:
        text: Text to split
        max_chars: Maximum characters per segment
    Returns:
        List of text segments, in the same order as they appear in ``text``.
        Empty text, or text that already fits, is returned unchanged as a
        single-element list.
    """
    # Handle empty or very short text
    if not text or len(text) <= max_chars:
        return [text]

    segments: List[str] = []
    current = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            # re.split yields an empty trailing piece when the text ends
            # with punctuation followed by whitespace.
            continue

        if len(sentence) <= max_chars:
            pieces = [sentence]
        else:
            # An over-long sentence never shares a segment with the text
            # before it: flush what has been accumulated, then fall back
            # to phrase-level pieces.
            if current:
                segments.append(current)
                current = ""
            pieces = re.split(r'(?<=[,;:])\s+', sentence)

        for piece in pieces:
            if not piece:
                continue
            if len(piece) > max_chars:
                # Flush first so the word chunks land after the phrases
                # that precede them in the text.
                if current:
                    segments.append(current)
                    current = ""
                segments.extend(_pack_words(piece, max_chars))
            elif current and len(current) + 1 + len(piece) <= max_chars:
                # The +1 accounts for the joining space.
                current = f"{current} {piece}"
            else:
                if current:
                    segments.append(current)
                current = piece

    # Add the last segment
    if current:
        segments.append(current)

    logger.info("Split text into %d segments", len(segments))
    return segments

def format_text_for_voice(text: str, voice_name: str, segment_index: int = 0, total_segments: int = 1) -> str:
    """Return the text exactly as given; no voice markup is added.

    Args:
        text: Text to format
        voice_name: Name of the voice (currently unused)
        segment_index: Index of this segment for multi-segment texts
            (currently unused)
        total_segments: Total number of segments (currently unused)
    Returns:
        The input ``text``, unmodified.
    """
    # NOTE: Bracketed voice instructions are deliberately not prepended any
    # more, because CSM reads them aloud. Voice identity is controlled through
    # speaker IDs instead, and the Generator class takes care of the actual
    # formatting.
    formatted = text
    return formatted