Spaces: Running on Zero
File size: 7,382 Bytes
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces
# Install the prebuilt torchmcubes wheel bundled with the Space at startup.
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'), check=True)
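# On ZeroGPU Spaces, functions decorated with @spaces.GPU (used below) are
# allocated a GPU only for the duration of each call, so models loaded inside
# them are instantiated on demand rather than at import time.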
@spaces.GPU
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script of 8-10 minutes."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    response = llm(prompt, max_length=1500, do_sample=True, temperature=0.7)
    # The text-generation pipeline returns the prompt followed by the
    # continuation, so strip the prompt prefix before returning.
    return response[0]['generated_text'][len(prompt):].strip()
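# Example call (hypothetical topic):
#   script = generate_script("The Future of AI")
# With the prompt prefix stripped above, only the model's continuation is
# returned; its length is bounded by max_length tokens.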
def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)

    def load_font(size):
        # Fall back to PIL's built-in font if the DejaVu TTF is unavailable.
        try:
            return ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", size)
        except IOError:
            return ImageFont.load_default()

    # Start at the maximum font size and shrink until the text fits.
    font_size = max_font_size
    font = load_font(font_size)
    while True:
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_w = text_bbox[2] - text_bbox[0]
        text_h = text_bbox[3] - text_bbox[1]
        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit loop
        if font_size <= min_font_size:
            break  # Stop shrinking below the minimum size
        font_size -= 2  # Decrease font size
        font = load_font(font_size)
    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")
    return title_img
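# Example (hypothetical title): create_centered_title((640, 480), "My Movie")
# starts at 50 pt and shrinks in 2 pt steps until the text fits inside the
# 20 px padding, then draws it centered on a black frame.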
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
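# Worked example: at 2.5 words/second each word adds 0.4 s, so a chunk closes
# once it reaches max_sec=7 at the 18th word (7.2 s). A 40-word text therefore
# yields chunks of roughly 18, 18, and 4 words.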
@spaces.GPU
def generate_speech(text):
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
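# Note: each call writes (and overwrites) a single speech.wav for the whole
# script; the narration is not split per image chunk.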
@spaces.GPU
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    image_paths = []
    if use_diffusion:
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            # Fallback: render the chunk text on a plain black frame.
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
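# Example (hypothetical prompt): generate_images(["a red fox in snow"],
# use_diffusion=False) skips Stable Diffusion entirely and renders the chunk
# text onto a plain black frame instead.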
def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
    clips = []
    # Title clip using PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)
    title_img_path = "title.png"
    title_img.save(title_img_path)
    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        clips.append(clip)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
def process_text(text, movie_title, image_size, use_diffusion, num_steps):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path
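# Example end-to-end call (hypothetical inputs, diffusion disabled for speed):
#   process_text("Some narration text...", "Demo", (640, 480), False, 1)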
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI 🎥")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.
    The system then brings your script to life by creating relevant images using Stable Diffusion and compiling them into a video.
    To make your content even more engaging, AI-powered text-to-speech (TTS) is used to generate realistic voice narration for the video.
    Perfect for content creators looking to streamline their workflow and focus on creativity! 🚀
    """)
    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()
    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
        if file is not None and hasattr(file, "name"):  # Check if 'file' is a file object
            with open(file.name, "r") as f:
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)

    process_btn.click(handle_request, inputs=[text_input, topic_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)

demo.launch(share=True)
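# Note: when running on Hugging Face Spaces, share=True has no effect (Gradio
# warns that share links are not supported there) and the app is served
# directly by the Space.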