import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces
# Install a prebuilt torchmcubes wheel bundled with the Space at startup
# (added while trying to solve ZeroGPU runtime issues).
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'))
@spaces.GPU
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script that takes about 8-10 minutes to read aloud."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    # return_full_text=False drops the prompt echo so only the generated script is returned
    response = llm(prompt, max_length=1500, do_sample=True, temperature=0.7, return_full_text=False)
    return response[0]['generated_text']
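# Rough usage sketch (not executed here): generating a script for a topic typed into the UI.
#   script = generate_script("The Future of AI")
#   print(script[:300])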
def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

    def load_font(size):
        """Falls back to PIL's built-in font if DejaVu is not installed."""
        try:
            return ImageFont.truetype(font_path, size)
        except IOError:
            return ImageFont.load_default()

    def measure(font):
        """Returns (width, height) of the text rendered with the given font."""
        bbox = draw.textbbox((0, 0), text, font=font)
        return bbox[2] - bbox[0], bbox[3] - bbox[1]

    # Start at the maximum font size and shrink until the text fits inside the padded area
    font_size = max_font_size
    font = load_font(font_size)
    text_w, text_h = measure(font)
    while font_size > min_font_size:
        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit loop
        font_size -= 2  # Decrease font size and re-measure
        font = load_font(font_size)
        text_w, text_h = measure(font)
    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")
    return title_img
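# Rough usage sketch (not executed here): render and inspect a title card on its own.
#   create_centered_title((640, 480), "The Future of AI").save("title_preview.png")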
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Splits text into chunks that each take roughly min_sec to max_sec seconds to read aloud."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
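# Worked example of the defaults above: at 2.5 words/sec each word contributes 0.4 s, so a chunk
# becomes eligible to close after ~13 words (>= 5 s) and actually closes once it reaches ~18 words
# (>= 7 s); the len(current_chunk) > 20 guard only takes over when words_per_second is above ~3.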
@spaces.GPU
def generate_speech(text):
    """Synthesizes narration for the full text with Coqui TTS and returns the WAV file path."""
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
@spaces.GPU
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    """Generates one image per chunk: Stable Diffusion renders if enabled, plain text cards otherwise."""
    image_paths = []
    pipe = None
    if use_diffusion:
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
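# Optional VRAM saving (a sketch, not wired in above): on CUDA the pipeline can be loaded in half
# precision, e.g.
#   pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
# which roughly halves memory use; keep the float32 default when falling back to CPU.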
def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
    """Assembles the title card, chunk images, and a closing black frame into output.mp4 with narration."""
    clips = []
    # Title clip using PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)
    title_img_path = "title.png"
    title_img.save(title_img_path)
    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        clips.append(clip)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
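# Note: the set_duration/set_audio/concatenate_videoclips calls above follow the MoviePy 1.x API
# exposed via moviepy.editor; MoviePy 2.x renamed these methods to with_duration/with_audio and
# dropped the editor module, so a moviepy<2 pin is assumed here.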
def process_text(text, movie_title, image_size, use_diffusion, num_steps):
    """Runs the full pipeline: chunking, speech synthesis, image generation, and video assembly."""
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path
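# Rough end-to-end sketch (not executed here; diffusion disabled so it runs quickly without a GPU):
#   process_text("A two-sentence demo script about the ocean.", "Ocean Demo",
#                (640, 480), use_diffusion=False, num_steps=1)
# writes speech.wav, image_0.png, ..., title.png and output.mp4 into the working directory.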
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI 🎥")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.
    The system then brings your script to life by creating relevant images using Stable Diffusion and compiling them into a video.
    To make your content even more engaging, AI-powered text-to-speech (TTS) is used to generate realistic voice narration for the video.
    Perfect for content creators looking to streamline their workflow and focus on creativity! 🚀
    """)
    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
        if file is not None and hasattr(file, "name"):  # Check if 'file' is a file object
            with open(file.name, "r") as f:
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)

    process_btn.click(handle_request,
                      inputs=[text_input, topic_input, movie_title_input, file_input,
                              image_size_input, use_diffusion_input, num_steps_input],
                      outputs=output_video)

demo.launch(share=True)