Update app.py
app.py
CHANGED
@@ -1,210 +1,191 @@
 import os
-…
 import random
 import shutil
-…
-from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-…
 import gradio as gr

-…
     for line in script_text.splitlines():
-…
-    file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
-    try:
-        tts = gTTS(text=text, lang='en')
-        mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
-        tts.save(mp3_path)
-        from pydub import AudioSegment
-        audio = AudioSegment.from_mp3(mp3_path)
-        audio.export(file_path, format="wav")
-        os.remove(mp3_path)
-        print(f"TTS audio saved to {file_path}")
-        return file_path
-    except Exception as e:
-        print(f"TTS generation failed: {e}")
-        return None
-
-def get_audio_duration(audio_path):
-    """Get the duration of the audio file."""
-    audio = AudioFileClip(audio_path)
-    duration = audio.duration
-    audio.close()
-    return duration
-
-def generate_subtitle_image(text, font_path="arial.ttf", font_size=45, text_color=(255, 255, 255, 255), bg_color=(0, 0, 0, 64), size=(1080, 200)):
-    """Generate a subtitle image with wrapped text."""
-    img = Image.new('RGBA', size, (0, 0, 0, 0))  # Transparent background
-    draw = ImageDraw.Draw(img)
-    if bg_color:
-        draw.rectangle([(0, 0), size], fill=bg_color)
     try:
-…
-        mixed_audio = CompositeAudioClip([video_audio, bg_music])
-        video_clip = video_clip.set_audio(mixed_audio)
-        return video_clip
-
-def generate_video(user_input, include_captions):
-    """Generate a video with voiceover and optional captions."""
-    global TEMP_FOLDER
-    TEMP_FOLDER = tempfile.mkdtemp()
-
-    # Generate and parse script
-    script = generate_dummy_script()  # Using dummy script for 64s duration
-    full_narration = parse_script(script)
-    print("Full Narration:", full_narration)
-
-    # Generate voiceover
-    tts_path = generate_tts(full_narration)
-    if not tts_path:
-        shutil.rmtree(TEMP_FOLDER)
-        return None
-
-    # Assuming voiceover is 64 seconds as per requirement
-    audio_duration = 64  # Hardcoded for this example
-    video_duration = audio_duration + 0.5  # 64.5 seconds clip
-
-    # Cut video clip from 13-minute video
-    video_path = "video.mp4"
-    if not os.path.exists(video_path):
-        print("video.mp4 not found.")
-        shutil.rmtree(TEMP_FOLDER)
-        return None
-
-    long_video = VideoFileClip(video_path)
-    total_duration = long_video.duration  # 13 minutes = 780 seconds
-    if total_duration < video_duration:
-        print("Video is too short.")
-        long_video.close()
-        shutil.rmtree(TEMP_FOLDER)
-        return None
-
-    start_time = random.uniform(0, total_duration - video_duration)
-    video_clip = long_video.subclip(start_time, start_time + video_duration)
-    long_video.close()
-
-    # Set voiceover audio
-    video_clip = video_clip.set_audio(AudioFileClip(tts_path))
-
-    # Add captions if requested
-    if include_captions == "Yes":
-        words = full_narration.split()
-        num_words = len(words)
-        word_duration = audio_duration / num_words  # Timing based on audio_duration
-        chunks = [words[i:i+5] for i in range(0, num_words, 5)]
-        subtitle_clips = []
         for i, chunk in enumerate(chunks):
-…
 iface = gr.Interface(
     fn=generate_video,
     inputs=[
-        gr.Textbox(label="Video Concept"…
-        gr.…
     ],
     outputs=gr.Video(label="Generated Video"),
-    title="Video Generator",
-    description="…
 )

 if __name__ == "__main__":
-    iface.launch()
 import os
+import re
+import math
 import random
+import tempfile
 import shutil
+import requests
 import numpy as np
+from kokoro import KPipeline
+import soundfile as sf
+from pydub import AudioSegment
+from gtts import gTTS
 import gradio as gr
+from moviepy.editor import (
+    VideoFileClip, AudioFileClip, concatenate_audioclips,
+    CompositeAudioClip, CompositeVideoClip, TextClip
+)

+# ────────── GLOBAL CONFIG ──────────
+OPENROUTER_API_KEY = 'sk-or-v1-…'
+OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+SOURCE_VIDEO_PATH = "video.mp4"    # 13-min source
+OUTPUT_VIDEO_PATH = "final_video.mp4"
+TARGET_RESOLUTION = (1080, 1920)   # Vertical TikTok style
+VOICE_SPEED = 0.9
+CAPTION_FONT_SIZE = 45
+BG_MUSIC_VOLUME = 0.08
+
+# Initialize Kokoro TTS
+pipeline = KPipeline(lang_code='a')  # American English
+
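A note on the key above: committing even a truncated OpenRouter key to the repo invites abuse. On Spaces the usual pattern is a repository secret read from the environment; a minimal sketch, assuming a secret named OPENROUTER_API_KEY is configured:

    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")  # empty fallback keeps local imports working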
+# ────────── SCRIPT GENERATION ──────────
+def generate_script(topic: str) -> str:
+    headers = {
+        'Authorization': f'Bearer {OPENROUTER_API_KEY}',
+        'HTTP-Referer': 'https://your-domain.com',
+        'X-Title': 'AI Documentary Maker'
+    }
+    prompt = f"""You're a professional documentary narrator.
+Break your script into scenes with [Tags], one sentence each (≤12 words).
+No slang or numbers. At the end, include [Subscribe] with a formal reason.
+
+Topic: {topic}
+"""
+    payload = {
+        'model': OPENROUTER_MODEL,
+        'messages': [{'role': 'user', 'content': prompt}],
+        'temperature': 0.4,
+        'max_tokens': 5000
+    }
+    r = requests.post('https://openrouter.ai/api/v1/chat/completions',
+                      headers=headers, json=payload, timeout=30)
+    r.raise_for_status()
+    return r.json()['choices'][0]['message']['content']
+
+def parse_script(script_text: str):
+    """Return list of (scene_title, sentence_text)."""
+    sections = []
+    current = None
     for line in script_text.splitlines():
+        m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
+        if m:
+            if current:
+                sections.append(current)
+            current = [m.group(1), m.group(2)]
+        elif current and line.strip():
+            current[1] += ' ' + line.strip()
+    if current:
+        sections.append(current)
+    return sections
+
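For reference, the parser turns a tagged script into (title, sentence) pairs; a quick check with a hypothetical three-scene script:

    sample = "[Opening] The desert hides a forgotten city.\n[Discovery] Archaeologists uncover walls beneath the sand.\n[Subscribe] Subscribe for more journeys into history."
    print(parse_script(sample))
    # [['Opening', 'The desert hides a forgotten city.'],
    #  ['Discovery', 'Archaeologists uncover walls beneath the sand.'],
    #  ['Subscribe', 'Subscribe for more journeys into history.']]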
+# ────────── TTS ──────────
+def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
+    """Produce a WAV file, using Kokoro then gTTS fallback."""
+    safe = re.sub(r'[^\w]', '_', text[:10]).strip()
+    out_path = os.path.join(dirpath, f"tts_{safe}.wav")
+    if os.path.exists(out_path):
+        return out_path
+
     try:
+        # Kokoro pipeline returns a sequence of numpy audio arrays
+        segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
+        arrays = [seg_audio for _, _, seg_audio in segments]
+        combined = np.concatenate(arrays, axis=0)
+        sf.write(out_path, combined, 24000)
+    except Exception:
+        # fallback to gTTS
+        mp3 = os.path.join(dirpath, f"{safe}.mp3")
+        gTTS(text=text, lang='en').save(mp3)
+        wav = AudioSegment.from_mp3(mp3)
+        wav.export(out_path, format="wav")
+        os.remove(mp3)
+    return out_path
+
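A quick smoke test for this function, assuming the stock 'af_heart' Kokoro voice is installed (a sketch, not part of app.py):

    tmp = tempfile.mkdtemp()
    wav = generate_tts_audio("This is a short narration test.", "af_heart", tmp)
    data, rate = sf.read(wav)
    print(f"{len(data) / rate:.1f} s at {rate} Hz")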
+# ────────── VIDEO + SUBTITLES ──────────
+def add_pillow_subtitles(clip, sections):
+    """Overlay each sentence as timed subtitle clips."""
+    subtitles = []
+    cum_time = 0
+    total_words = sum(len(s.split()) for _, s in sections)
+    for title, sentence in sections:
+        # split sentence into 5-word chunks
+        words = sentence.split()
+        chunks = [words[i:i+5] for i in range(0, len(words), 5)]
+        # screen time proportional to this sentence's share of all words
+        seg_dur = clip.duration * (len(words) / total_words)
+        # approximate each chunk duration
+        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
         for i, chunk in enumerate(chunks):
+            txt = ' '.join(chunk)
+            txt_clip = (
+                TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
+                         color='white', bg_color='rgba(0,0,0,0.3)',
+                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
+                         method='caption')  # 'caption' wraps text to the given width
+                .set_start(cum_time + i * chunk_dur)
+                .set_duration(chunk_dur)
+                .set_position(('center', int(TARGET_RESOLUTION[1] * 0.8)))
+            )
+            subtitles.append(txt_clip)
+        cum_time += seg_dur
+    return subtitles
+
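The timing here is purely proportional: a sentence's share of the total word count fixes its share of the clip, and that span is then split evenly across its 5-word chunks. For a 64 s clip and hypothetical sentences of 8, 12, and 12 words:

    clip_duration = 64.0
    word_counts = [8, 12, 12]   # hypothetical sentence lengths
    total = sum(word_counts)    # 32
    print([clip_duration * w / total for w in word_counts])
    # [16.0, 24.0, 24.0]; the 8-word sentence shows for 16 s, 8 s per chunk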
+def generate_video(topic, include_captions, music_file, voice_choice):
+    if not os.path.exists(SOURCE_VIDEO_PATH):
+        raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
+    # 1) get script & sections
+    script = generate_script(topic)
+    sections = parse_script(script)
+    # 2) TTS each sentence (map the dropdown's display name to a Kokoro voice code)
+    tmp = tempfile.mkdtemp()
+    tts_paths = [
+        generate_tts_audio(sentence, VOICE_MAP[voice_choice], tmp)
+        for _, sentence in sections
+    ]
+    # 3) concatenate all TTS audios
+    clips_audio = [AudioFileClip(p) for p in tts_paths]
+    narration = concatenate_audioclips(clips_audio)
+    total_dur = narration.duration
+
+    # 4) pick one random subclip (keep src open: the subclip shares its reader)
+    src = VideoFileClip(SOURCE_VIDEO_PATH)
+    start = random.uniform(0, max(0, src.duration - total_dur))
+    video = src.subclip(start, start + total_dur).resize(TARGET_RESOLUTION)
+
+    # 5) overlay narration
+    video = video.set_audio(narration)
+
+    # 6) optional subtitles
+    if include_captions:
+        subs = add_pillow_subtitles(video, sections)
+        video = CompositeVideoClip([video, *subs])
+
+    # 7) optional background music
+    if music_file:
+        bg = AudioFileClip(getattr(music_file, 'name', music_file))  # gr.File may pass a path or a file object
+        # loop or trim to the video length
+        loops = math.ceil(video.duration / bg.duration)
+        bg = concatenate_audioclips([bg] * loops).subclip(0, video.duration).volumex(BG_MUSIC_VOLUME)
+        mixed = CompositeAudioClip([video.audio, bg])
+        video = video.set_audio(mixed)
+
+    # 8) export
+    video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', fps=30, preset='veryfast')
+
+    # cleanup
+    src.close()
+    shutil.rmtree(tmp)
+    return OUTPUT_VIDEO_PATH
+
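Usage sketch, bypassing the UI (assumes video.mp4 sits next to app.py and the OpenRouter key is valid; this spends API credits and renders a file):

    out = generate_video("The history of lighthouses", True, None, "Emma (Female)")
    print("wrote", out)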
+# ────────── GRADIO UI ──────────
+VOICE_MAP = {
+    'Emma (Female)': 'af_heart',
+    'Bella (Female)': 'af_bella',
+    # … add others as needed …
+}
+
 iface = gr.Interface(
     fn=generate_video,
     inputs=[
+        gr.Textbox(label="Video Concept"),
+        gr.Checkbox(label="Include Captions"),
+        gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
+        gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
     ],
     outputs=gr.Video(label="Generated Video"),
+    title="AI Documentary Video Generator",
+    description="Cuts one ~64 s clip from your 13 min video, adds AI narration & TikTok-style subtitles."
 )

 if __name__ == "__main__":
+    iface.launch(share=True)
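One packaging note: none of the new imports ship with the default Space image, so requirements.txt presumably needs entries along these lines (versions are a guess; moviepy must stay on the 1.x line for moviepy.editor to exist, and Kokoro additionally wants the espeak-ng system package via packages.txt):

    kokoro
    soundfile
    pydub
    gTTS
    requests
    numpy
    gradio
    moviepy==1.0.3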