testdeep123 committed
Commit c25723f · verified · 1 Parent(s): fb20f92

Update app.py

Files changed (1):
  1. app.py +174 -193
app.py CHANGED
@@ -1,210 +1,191 @@
  import os
- from moviepy.editor import VideoFileClip, AudioFileClip, ImageClip, CompositeVideoClip
- import tempfile
  import random
  import shutil
- from gtts import gTTS
- from PIL import Image, ImageDraw, ImageFont
  import numpy as np
- import textwrap
  import gradio as gr

- # Global Configuration
- OUTPUT_VIDEO_FILENAME = "final_video.mp4"
- TARGET_RESOLUTION = (1080, 1920)  # Vertical video resolution
- TEMP_FOLDER = None
- font_size = 45
- fps = 30
- preset = "veryfast"
- bg_music_volume = 0.08
-
- def generate_dummy_script():
-     """Generate a dummy script that results in approximately 64 seconds of narration."""
-     return """
- [Intro]
- The world is full of natural wonders.
- [Forests]
- Forests cover vast regions of the planet.
- [Rivers]
- Rivers flow through landscapes shaping the earth.
- [Mountains]
- Mountains stand tall against the sky above.
- [Oceans]
- Oceans hold mysteries beneath their waves.
- [Wildlife]
- Wildlife thrives in diverse habitats worldwide.
- [Conclusion]
- Nature continues to inspire us all.
- """
-
- def parse_script(script_text):
-     """Parse the script to extract full narration text."""
-     sections = {}
-     current_title = None
-     current_text = ""
      for line in script_text.splitlines():
-         line = line.strip()
-         if line.startswith("[") and "]" in line:
-             bracket_start = line.find("[")
-             bracket_end = line.find("]", bracket_start)
-             if bracket_start != -1 and bracket_end != -1:
-                 if current_title is not None:
-                     sections[current_title] = current_text.strip()
-                 current_title = line[bracket_start+1:bracket_end]
-                 current_text = line[bracket_end+1:].strip()
-         elif current_title:
-             current_text += line + " "
-     if current_title:
-         sections[current_title] = current_text.strip()
-     full_narration = " ".join(sections.values())
-     return full_narration
-
- def generate_tts(text):
-     """Generate TTS audio for the full narration."""
-     safe_text = "narration"
-     file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
-     try:
-         tts = gTTS(text=text, lang='en')
-         mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
-         tts.save(mp3_path)
-         from pydub import AudioSegment
-         audio = AudioSegment.from_mp3(mp3_path)
-         audio.export(file_path, format="wav")
-         os.remove(mp3_path)
-         print(f"TTS audio saved to {file_path}")
-         return file_path
-     except Exception as e:
-         print(f"TTS generation failed: {e}")
-         return None
-
- def get_audio_duration(audio_path):
-     """Get the duration of the audio file."""
-     audio = AudioFileClip(audio_path)
-     duration = audio.duration
-     audio.close()
-     return duration
-
- def generate_subtitle_image(text, font_path="arial.ttf", font_size=45, text_color=(255, 255, 255, 255), bg_color=(0, 0, 0, 64), size=(1080, 200)):
-     """Generate a subtitle image with wrapped text."""
-     img = Image.new('RGBA', size, (0, 0, 0, 0))  # Transparent background
-     draw = ImageDraw.Draw(img)
-     if bg_color:
-         draw.rectangle([(0, 0), size], fill=bg_color)
      try:
-         font = ImageFont.truetype(font_path, font_size)
-     except IOError:
-         font = ImageFont.load_default()
-     lines = textwrap.wrap(text, width=40)
-     line_height = font.getsize('hg')[1]
-     total_height = line_height * len(lines)
-     y_start = (size[1] - total_height) / 2
-     for i, line in enumerate(lines):
-         text_width, _ = draw.textsize(line, font=font)
-         x = (size[0] - text_width) / 2
-         y = y_start + i * line_height
-         draw.text((x, y), line, font=font, fill=text_color)
-     return np.array(img)
-
- def add_background_music(video_clip):
-     """Add background music to the video if available."""
-     bg_music_path = "music.mp3"
-     if os.path.exists(bg_music_path):
-         bg_music = AudioFileClip(bg_music_path)
-         if bg_music.duration < video_clip.duration:
-             from moviepy.audio.AudioClip import concatenate_audioclips
-             loops_needed = int(video_clip.duration / bg_music.duration) + 1
-             bg_music = concatenate_audioclips([bg_music] * loops_needed)
-         bg_music = bg_music.subclip(0, video_clip.duration)
-         bg_music = bg_music.volumex(bg_music_volume)
-         video_audio = video_clip.audio
-         from moviepy.audio.AudioClip import CompositeAudioClip
-         mixed_audio = CompositeAudioClip([video_audio, bg_music])
-         video_clip = video_clip.set_audio(mixed_audio)
-     return video_clip
-
- def generate_video(user_input, include_captions):
-     """Generate a video with voiceover and optional captions."""
-     global TEMP_FOLDER
-     TEMP_FOLDER = tempfile.mkdtemp()
-
-     # Generate and parse script
-     script = generate_dummy_script()  # Using dummy script for 64s duration
-     full_narration = parse_script(script)
-     print("Full Narration:", full_narration)
-
-     # Generate voiceover
-     tts_path = generate_tts(full_narration)
-     if not tts_path:
-         shutil.rmtree(TEMP_FOLDER)
-         return None
-
-     # Assuming voiceover is 64 seconds as per requirement
-     audio_duration = 64  # Hardcoded for this example
-     video_duration = audio_duration + 0.5  # 64.5 seconds clip
-
-     # Cut video clip from 13-minute video
-     video_path = "video.mp4"
-     if not os.path.exists(video_path):
-         print("video.mp4 not found.")
-         shutil.rmtree(TEMP_FOLDER)
-         return None
-
-     long_video = VideoFileClip(video_path)
-     total_duration = long_video.duration  # 13 minutes = 780 seconds
-     if total_duration < video_duration:
-         print("Video is too short.")
-         long_video.close()
-         shutil.rmtree(TEMP_FOLDER)
-         return None
-
-     start_time = random.uniform(0, total_duration - video_duration)
-     video_clip = long_video.subclip(start_time, start_time + video_duration)
-     long_video.close()
-
-     # Set voiceover audio
-     video_clip = video_clip.set_audio(AudioFileClip(tts_path))
-
-     # Add captions if requested
-     if include_captions == "Yes":
-         words = full_narration.split()
-         num_words = len(words)
-         word_duration = audio_duration / num_words  # Timing based on audio_duration
-         chunks = [words[i:i+5] for i in range(0, num_words, 5)]
-         subtitle_clips = []
          for i, chunk in enumerate(chunks):
-             chunk_text = ' '.join(chunk)
-             start_idx = i * 5
-             end_idx = start_idx + len(chunk) - 1
-             start_time_chunk = start_idx * word_duration
-             end_time_chunk = min((end_idx + 1) * word_duration, audio_duration)
-             subtitle_img = generate_subtitle_image(chunk_text, font_size=font_size)
-             txt_clip = ImageClip(subtitle_img).set_start(start_time_chunk).set_duration(end_time_chunk - start_time_chunk)
-             txt_clip = txt_clip.set_position(('center', TARGET_RESOLUTION[1] - 200))
-             subtitle_clips.append(txt_clip)
-         video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
-
-     # Add background music
-     video_clip = add_background_music(video_clip)
-
-     # Export video
-     video_clip.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
-     print(f"Video saved as {OUTPUT_VIDEO_FILENAME}")
-
-     # Cleanup
-     shutil.rmtree(TEMP_FOLDER)
-     return OUTPUT_VIDEO_FILENAME
-
- # Gradio Interface
  iface = gr.Interface(
      fn=generate_video,
      inputs=[
-         gr.Textbox(label="Video Concept", placeholder="Enter concept (ignored for this example)"),
-         gr.Radio(["Yes", "No"], label="Include Captions", value="No")
      ],
      outputs=gr.Video(label="Generated Video"),
-     title="Video Generator",
-     description="Generates a 64.5s video clip with a 64s voiceover from a 13min video."
  )

  if __name__ == "__main__":
-     iface.launch()
 
  import os
+ import re
+ import math
  import random
+ import tempfile
  import shutil
+ import requests
  import numpy as np
+ from kokoro import KPipeline
+ import soundfile as sf
+ from pydub import AudioSegment
+ from gtts import gTTS
  import gradio as gr
+ from moviepy.editor import (
+     VideoFileClip, AudioFileClip, concatenate_audioclips,
+     CompositeAudioClip, CompositeVideoClip, TextClip
+ )

+ # ────────── GLOBAL CONFIG ──────────
+ OPENROUTER_API_KEY = 'sk-or-v1-…'
+ OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+ SOURCE_VIDEO_PATH = "video.mp4"   # 13-min source
+ OUTPUT_VIDEO_PATH = "final_video.mp4"
+ TARGET_RESOLUTION = (1080, 1920)  # Vertical TikTok style
+ VOICE_SPEED = 0.9
+ CAPTION_FONT_SIZE = 45
+ BG_MUSIC_VOLUME = 0.08
+
+ # Initialize Kokoro TTS
+ pipeline = KPipeline(lang_code='a')  # American English
+
+ # ────────── SCRIPT GENERATION ──────────
+ def generate_script(topic: str) -> str:
+     headers = {
+         'Authorization': f'Bearer {OPENROUTER_API_KEY}',
+         'HTTP-Referer': 'https://your-domain.com',
+         'X-Title': 'AI Documentary Maker'
+     }
+     prompt = f"""You’re a professional documentary narrator.
+ Break your script into scenes with [Tags], one sentence each (≤12 words).
+ No slang or numbers. At the end, include [Subscribe] with a formal reason.
+
+ Topic: {topic}
+ """
+     payload = {
+         'model': OPENROUTER_MODEL,
+         'messages': [{'role': 'user', 'content': prompt}],
+         'temperature': 0.4,
+         'max_tokens': 5000
+     }
+     r = requests.post('https://openrouter.ai/api/v1/chat/completions',
+                       headers=headers, json=payload, timeout=30)
+     r.raise_for_status()
+     return r.json()['choices'][0]['message']['content']
+
+ def parse_script(script_text: str):
+     """Return list of (scene_title, sentence_text)."""
+     sections = []
+     current = None
      for line in script_text.splitlines():
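+         # a line like "[Scene Title] first sentence" starts a new section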
+         m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
+         if m:
+             if current:
+                 sections.append(current)
+             current = [m.group(1), m.group(2)]
+         elif current and line.strip():
+             current[1] += ' ' + line.strip()
+     if current:
+         sections.append(current)
+     return sections
+
+ # ────────── TTS ──────────
+ def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
+     """Produce a WAV file, using Kokoro with a gTTS fallback."""
+     safe = re.sub(r'[^\w]', '_', text[:10]).strip()
+     out_path = os.path.join(dirpath, f"tts_{safe}.wav")
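+     # reuse cached audio if this sentence was already synthesized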
+     if os.path.exists(out_path):
+         return out_path
+
      try:
+         # Kokoro pipeline returns a sequence of numpy audio arrays
+         segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
+         arrays = [seg_audio for _, _, seg_audio in segments]
+         combined = np.concatenate(arrays, axis=0)
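+         # Kokoro outputs 24 kHz audio, so write the WAV at that sample rate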
+         sf.write(out_path, combined, 24000)
+     except Exception:
+         # fallback to gTTS
+         mp3 = os.path.join(dirpath, f"{safe}.mp3")
+         gTTS(text=text, lang='en').save(mp3)
+         wav = AudioSegment.from_mp3(mp3)
+         wav.export(out_path, format="wav")
+         os.remove(mp3)
+     return out_path
+
+ # ────────── VIDEO + SUBTITLES ──────────
+ def add_pillow_subtitles(clip, sections):
+     """Overlay each sentence as timed subtitles using Pillow (no ImageMagick)."""
+     subtitles = []
+     cum_time = 0
+     for title, sentence in sections:
+         audio_path = clip.audio  # duration only
+         # split sentence into 5-word chunks
+         words = sentence.split()
+         chunks = [words[i:i+5] for i in range(0, len(words), 5)]
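+         # give each sentence screen time proportional to its share of all narration words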
+         seg_dur = clip.duration * (len(sentence.split()) / sum(len(s.split()) for _, s in sections))
+         # approximate each chunk duration
+         chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
          for i, chunk in enumerate(chunks):
+             txt = ' '.join(chunk)
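+             # NOTE: method='pillow' assumes a MoviePy build whose TextClip can render
+             # via Pillow; stock MoviePy 1.x only offers 'label'/'caption' (ImageMagick)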
+             txt_clip = (
+                 TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
+                          color='white', bg_color='rgba(0,0,0,0.3)',
+                          size=(TARGET_RESOLUTION[0]*0.9, None),
+                          method='pillow')
+                 .set_start(cum_time + i*chunk_dur)
+                 .set_duration(chunk_dur)
+                 .set_position(('center', int(TARGET_RESOLUTION[1]*0.8)))
+             )
+             subtitles.append(txt_clip)
+         cum_time += seg_dur
+     return subtitles
+
+ def generate_video(topic, include_captions, music_file, voice_choice):
+     if not os.path.exists(SOURCE_VIDEO_PATH):
+         raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
+     # 1) get script & sections
+     script = generate_script(topic)
+     sections = parse_script(script)
+     # 2) TTS each sentence
+     tmp = tempfile.mkdtemp()
+     tts_paths = [
+         generate_tts_audio(sentence, voice_choice, tmp)
+         for _, sentence in sections
+     ]
+     # 3) concatenate all TTS audios
+     clips_audio = [AudioFileClip(p) for p in tts_paths]
+     narration = concatenate_audioclips(clips_audio)
+     total_dur = narration.duration
+
+     # 4) pick one random subclip
+     src = VideoFileClip(SOURCE_VIDEO_PATH)
+     start = random.uniform(0, max(0, src.duration - total_dur))
+     video = src.subclip(start, start + total_dur).resize(TARGET_RESOLUTION)
+     src.close()
+
+     # 5) overlay narration
+     video = video.set_audio(narration)
+
+     # 6) optional subtitles
+     if include_captions:
+         subs = add_pillow_subtitles(video, sections)
+         video = CompositeVideoClip([video, *subs])
+
+     # 7) optional background music
+     if music_file:
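+         # NOTE: depending on the Gradio version, music_file may be a filepath string
+         # or a tempfile wrapper (use music_file.name in the latter case)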
+         bg = AudioFileClip(music_file)
+         # loop or trim
+         loops = math.ceil(video.duration / bg.duration)
+         bg = concatenate_audioclips([bg]*loops).subclip(0, video.duration).volumex(BG_MUSIC_VOLUME)
+         mixed = CompositeAudioClip([video.audio, bg])
+         video = video.set_audio(mixed)
+
+     # 8) export
+     video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', fps=30, preset='veryfast')
+
+     # cleanup
+     shutil.rmtree(tmp)
+     return OUTPUT_VIDEO_PATH
+
+ # ────────── GRADIO UI ──────────
+ VOICE_MAP = {
+     'Emma (Female)': 'af_heart',
+     'Bella (Female)': 'af_bella',
+     # … add others as needed …
+ }
+
  iface = gr.Interface(
      fn=generate_video,
      inputs=[
+         gr.Textbox(label="Video Concept"),
+         gr.Checkbox(label="Include Captions"),
+         gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
+         gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
      ],
      outputs=gr.Video(label="Generated Video"),
+     title="AI Documentary Video Generator",
+     description="Cuts one ~64 s clip from your 13 min video, adds AI narration & TikTok-style subtitles."
  )

  if __name__ == "__main__":
+     iface.launch(share=True)