import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces
# Install the prebuilt torchmcubes wheel bundled with the repository.
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'))
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script of 8-10 minutes."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    # return_full_text=False keeps the prompt itself out of the returned script.
    response = llm(prompt, max_length=1500, do_sample=True, temperature=0.7, return_full_text=False)
    return response[0]['generated_text']
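# Rough sizing note: at a typical ~150 words per minute reading pace, an 8-10
# minute script is on the order of 1200-1500 words, which the max_length=1500
# token budget loosely targets (tokens != words, so treat this as an estimate).
# Hypothetical usage:
#   script = generate_script("The Future of AI")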
def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
    # Start from the maximum font size; fall back to the default bitmap font if DejaVu is missing.
    font_size = max_font_size
    try:
        font = ImageFont.truetype(font_path, font_size)
        scalable = True
    except IOError:
        font = ImageFont.load_default()
        scalable = False
    # Measure the text, then shrink the font until it fits inside the padded image area.
    text_bbox = draw.textbbox((0, 0), text, font=font)
    text_w = text_bbox[2] - text_bbox[0]
    text_h = text_bbox[3] - text_bbox[1]
    while scalable and font_size > min_font_size:
        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit loop
        font_size -= 2  # Decrease font size and re-measure
        font = ImageFont.truetype(font_path, font_size)
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_w = text_bbox[2] - text_bbox[0]
        text_h = text_bbox[3] - text_bbox[1]
    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")
    return title_img
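# Note: long titles are only shrunk, never wrapped. If wrapping is preferred, a
# minimal sketch (assuming a rough character budget per line) is to pre-wrap the
# title before calling this function; Pillow's text/textbbox dispatch to their
# multiline variants when the string contains newlines:
#   import textwrap
#   wrapped = "\n".join(textwrap.wrap("A Very Long Hypothetical Movie Title", width=30))
#   title_img = create_centered_title((640, 480), wrapped)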
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Splits text into chunks whose estimated read-aloud time falls between min_sec and max_sec."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
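# Worked example (approximate): at 2.5 words/second each word adds 0.4 s, so a
# chunk is flushed once it reaches ~7 s (18 words) or exceeds 20 words. A
# 30-word input therefore yields two chunks of roughly 18 and 12 words.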
def generate_speech(text):
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
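# Note: the Coqui TTS model runs on CPU here. Recent TTS releases also accept a
# device move, e.g. TTS("tts_models/en/ljspeech/glow-tts").to("cuda"), but that
# is version-dependent and untested in this script.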
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    image_paths = []
    if use_diffusion:
        # Load Stable Diffusion once and move it to GPU if one is available.
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            # Fallback: render the chunk text on a plain black slide.
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
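# Memory-saving option (untested here): on a CUDA device the pipeline can be
# loaded in half precision, which substantially reduces VRAM use:
#   pipe = StableDiffusionPipeline.from_pretrained(
#       "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
#   ).to("cuda")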
def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
    clips = []
    # Title clip using PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)
    title_img_path = "title.png"
    title_img.save(title_img_path)
    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        clips.append(clip)
    # Close with two seconds of black, then concatenate and attach the narration.
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
def process_text(text, movie_title, image_size, use_diffusion, num_steps):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    # Per-chunk display time: words / 2.5 words-per-second, clamped to 5-10 seconds.
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI 🎥")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.
    The system then brings your script to life by creating relevant images using Stable Diffusion and compiling them into a video.
    To make your content even more engaging, AI-powered text-to-speech (TTS) is used to generate realistic voice narration for the video.
    Perfect for content creators looking to streamline their workflow and focus on creativity! 🚀
    """)
    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()
    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
        if file is not None and hasattr(file, "name"):  # A file object was uploaded
            with open(file.name, "r") as f:
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        if not text:
            raise gr.Error("Please enter some text, a topic, or upload a .txt file.")
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)

    process_btn.click(handle_request,
                      inputs=[text_input, topic_input, movie_title_input, file_input,
                              image_size_input, use_diffusion_input, num_steps_input],
                      outputs=output_video)
demo.launch(share=True)