File size: 7,176 Bytes
952467c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0f225e
952467c
 
 
 
e0f225e
 
952467c
 
 
 
 
 
 
 
e0f225e
952467c
e0f225e
952467c
 
 
 
 
 
 
 
 
 
 
 
e0f225e
952467c
e0f225e
952467c
 
 
e0f225e
 
 
952467c
 
e0f225e
 
 
 
 
 
 
 
952467c
e0f225e
952467c
e0f225e
952467c
e0f225e
 
 
 
 
 
 
 
 
2f42eba
e0f225e
 
 
 
 
 
952467c
 
e0f225e
 
 
 
 
 
 
 
952467c
 
 
e0f225e
952467c
 
e0f225e
 
 
 
 
952467c
 
e0f225e
952467c
e0f225e
 
952467c
e0f225e
952467c
e0f225e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""
Text-to-speech audio generation for translated subtitles.
"""
import os
import time
import shutil
import tempfile
from pathlib import Path
from tqdm import tqdm
import subprocess

from gtts import gTTS
import pysrt

from src.utils.logger import get_logger
from src.audio.extractor import create_silent_audio
from config import OUTPUT_DIR, TTS_VOICES, MAX_RETRY_ATTEMPTS

logger = get_logger(__name__)

def _timestamp_to_seconds(stamp):
    """Convert an SRT timestamp (hours/minutes/seconds/milliseconds) to float seconds."""
    return (stamp.hours * 3600
            + stamp.minutes * 60
            + stamp.seconds
            + stamp.milliseconds / 1000)


def _is_nonempty_file(path):
    """Return True when ``path`` exists and holds at least one byte."""
    return path.exists() and path.stat().st_size > 0


def _silent_fallback(target_lang, video_duration):
    """Write a silent track for ``target_lang`` and return the path it was written to."""
    silent_audio = OUTPUT_DIR / f"translated_audio_{target_lang}.wav"
    create_silent_audio(video_duration, silent_audio)
    return silent_audio


def _build_tts(text, tts_lang, target_lang, slow):
    """Create a gTTS request, preferring the code configured in TTS_VOICES."""
    try:
        return gTTS(text=text, lang=tts_lang, slow=slow)
    except ValueError:
        # gTTS rejects language codes it does not know. If the configured
        # override is the culprit, retry with the caller's language code.
        if tts_lang == target_lang:
            raise
        logger.warning(
            f"TTS language '{tts_lang}' rejected, falling back to '{target_lang}'"
        )
        return gTTS(text=text, lang=target_lang, slow=slow)


def _synthesize_chunk(text, tts_lang, target_lang, audio_file):
    """Synthesize ``text`` into ``audio_file`` with retries; return True on success."""
    # For certain languages, use slower speed
    slow_option = target_lang in ("hi", "ja", "zh-CN", "ar")

    for attempt in range(1, MAX_RETRY_ATTEMPTS + 1):
        try:
            _build_tts(text, tts_lang, target_lang, slow_option).save(str(audio_file))
            if _is_nonempty_file(audio_file):
                return True
            raise RuntimeError("Generated audio file is empty")
        except Exception as e:
            logger.warning(f"TTS attempt {attempt} failed for {target_lang}: {str(e)}")
            # No point in waiting after the final attempt.
            if attempt < MAX_RETRY_ATTEMPTS:
                time.sleep(1)

    # Last resort: a shortened text. This runs after every regular attempt so
    # that a later full-text retry cannot overwrite its result, and it is
    # guarded so a failure here only loses this chunk, not the whole run.
    if len(text) > 100:
        logger.warning(f"Trying with shortened text for {target_lang}")
        try:
            _build_tts(text[:100] + "...", tts_lang, target_lang, True).save(str(audio_file))
        except Exception as e:
            logger.warning(f"Shortened-text TTS failed for {target_lang}: {str(e)}")
            return False
        return _is_nonempty_file(audio_file)

    return False


def _build_mix_command(silence_file, placed_chunks, video_duration, output_audio):
    """Build the FFmpeg command that delays each chunk to its start time and mixes them."""
    cmd = ['ffmpeg', '-y', '-i', str(silence_file)]
    for _, chunk_file in placed_chunks:
        cmd += ['-i', str(chunk_file)]

    # One filter chain per chunk. Input 0 is the silence track, so chunk i is
    # FFmpeg input i+1.
    # NOTE(review): the filter graph never references input 0; the silence
    # track is passed to FFmpeg but is not part of the mix.
    filter_chains = []
    for i, (start_time, _) in enumerate(placed_chunks):
        delay_ms = int(start_time * 1000)
        filter_chains.append(
            f"[{i+1}:a]volume=12dB,adelay={delay_ms}|{delay_ms},apad=whole_dur={video_duration}[a{i}]"
        )

    mix_inputs = ''.join(f"[a{i}]" for i in range(len(placed_chunks)))
    filter_complex = (
        ";".join(filter_chains)
        + f";{mix_inputs}amix=inputs={len(placed_chunks)}:duration=longest:normalize=0,volume=3dB[aout]"
    )

    cmd += [
        '-filter_complex', filter_complex,
        '-map', '[aout]',
        '-c:a', 'libmp3lame',
        '-b:a', '192k',
        str(output_audio),
    ]
    return cmd


def _remove_temp_dir(temp_dir):
    """Best-effort removal of the temporary chunk directory."""
    try:
        shutil.rmtree(temp_dir)
        logger.debug(f"Cleaned temporary directory: {temp_dir}")
    except Exception as e:
        logger.warning(f"Temp cleanup failed: {str(e)}")


def generate_translated_audio(srt_path, target_lang, video_duration=180):
    """
    Generate translated audio using text-to-speech for each subtitle.

    Each non-empty subtitle is synthesized with gTTS into its own chunk, then
    all chunks are delayed to their subtitle start time and mixed into a
    single MP3 with FFmpeg. If no chunk can be produced, mixing fails, or any
    other error occurs, a silent WAV track is written instead.

    Args:
        srt_path (str): Path to the SRT subtitle file
        target_lang (str): Target language code (e.g., 'en', 'es')
        video_duration (float): Duration of the original video in seconds

    Returns:
        Path: Path to the translated audio file
            (``translated_audio_<lang>.mp3`` on success, or the silent
            ``translated_audio_<lang>.wav`` fallback)

    Raises:
        Exception: Only if writing the silent fallback itself fails; all
            other failures are logged and result in the silent fallback.
    """
    temp_dir = None
    try:
        srt_path = Path(srt_path)
        logger.info(f"Generating translated audio for {target_lang} from {srt_path}")

        # Load subtitles
        subs = pysrt.open(srt_path, encoding="utf-8")
        logger.info(f"Loaded {len(subs)} subtitles from SRT file")

        # Create temporary directory for audio chunks. mkdtemp requires the
        # parent directory to exist, so create it first.
        temp_root = Path(OUTPUT_DIR) / "temp"
        temp_root.mkdir(parents=True, exist_ok=True)
        temp_dir = Path(tempfile.mkdtemp(prefix=f"audio_{target_lang}_", dir=temp_root))
        logger.debug(f"Created temporary directory: {temp_dir}")

        # Language code handed to gTTS; TTS_VOICES may override the target code.
        tts_lang = TTS_VOICES.get(target_lang, target_lang)

        # (start_time_seconds, chunk_path) for every successfully generated chunk
        placed_chunks = []

        logger.info(f"Generating speech for {len(subs)} subtitles in {target_lang}")
        for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} speech")):
            text = sub.text.strip()
            if not text:
                continue

            audio_file = temp_dir / f"chunk_{i:04d}.mp3"
            if _synthesize_chunk(text, tts_lang, target_lang, audio_file):
                logger.info(
                    f"Generated TTS file size for chunk {i}: {audio_file.stat().st_size} bytes"
                )
                placed_chunks.append((_timestamp_to_seconds(sub.start), audio_file))
            else:
                logger.warning(f"Failed to generate audio for subtitle {i}")

        # Fallback if no audio generated
        if not placed_chunks:
            logger.warning(f"No audio files generated for {target_lang}")
            return _silent_fallback(target_lang, video_duration)

        # Output configuration
        output_audio = OUTPUT_DIR / f"translated_audio_{target_lang}.mp3"
        silence_file = temp_dir / "silence.wav"
        create_silent_audio(video_duration, silence_file)

        # Chunk files were verified when they were generated; only the
        # silence track still needs checking before it is handed to FFmpeg.
        if not silence_file.exists():
            logger.error(f"Missing input file: {silence_file}")
            return _silent_fallback(target_lang, video_duration)

        cmd = _build_mix_command(silence_file, placed_chunks, video_duration, output_audio)
        logger.debug(f"Running FFmpeg command: {' '.join(cmd)}")

        # Execute audio mixing
        process = subprocess.run(cmd, capture_output=True, text=True)
        if process.returncode != 0:
            logger.error(f"Audio mixing failed: {process.stderr}")
            return _silent_fallback(target_lang, video_duration)

        logger.info(f"Final audio file size: {output_audio.stat().st_size} bytes")
        return output_audio

    except Exception as e:
        logger.error(f"Audio generation failed: {str(e)}", exc_info=True)
        return _silent_fallback(target_lang, video_duration)

    finally:
        # Remove the chunk directory on every exit path, not just on success.
        if temp_dir is not None:
            _remove_temp_dir(temp_dir)