import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces

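# Install the local torchmcubes wheel shipped in the repo's wheel/ directory at startup.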
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'))


@spaces.GPU
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script of 8-10 minutes."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    # The text-generation pipeline echoes the prompt by default; return_full_text=False
    # keeps only the newly generated script text.
    response = llm(prompt, max_new_tokens=1500, do_sample=True, temperature=0.7, return_full_text=False)
    return response[0]['generated_text']

def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)

    # Load the maximum font size
    font_size = max_font_size
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
    except IOError:
        font = ImageFont.load_default()

    # Reduce the font size until the text fits within the padded image area
    while font_size > min_font_size:
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_w = text_bbox[2] - text_bbox[0]
        text_h = text_bbox[3] - text_bbox[1]

        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit loop

        font_size -= 2  # Decrease font size and retry
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except IOError:
            break  # The default bitmap font cannot be resized, so stop shrinking

    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")

    return title_img

def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
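    """Splits the text into chunks sized for roughly min_sec to max_sec of narration,
    assuming an average reading speed of words_per_second."""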
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

@spaces.GPU
def generate_speech(text):
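    """Synthesizes the full script into a single WAV file with Coqui TTS (Glow-TTS, LJSpeech voice)."""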
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path

@spaces.GPU
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
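    """Renders one image per text chunk, either with Stable Diffusion v1.4 or as a plain text card."""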
    image_paths = []
    if use_diffusion:
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")

        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths

def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
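    """Assembles the title card, chunk images, and a closing black frame into a narrated MP4 with moviepy."""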
    clips = []
    
    # Title clip using PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)

    title_img_path = "title.png"
    title_img.save(title_img_path)

    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)
    
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)

        clips.append(clip)
    
    black_end = mp.ColorClip(image_size, color=(0,0,0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"

def process_text(text, movie_title, image_size, use_diffusion, num_steps):
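    """End-to-end pipeline: chunk the text, synthesize speech, generate images, and render the final video."""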
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path

with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI πŸŽ₯")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬  
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.  
    The system then brings your script to life by creating relevant images using Stable Diffusion and compiling them into a video.  
    To make your content even more engaging, AI-powered text-to-speech (TTS) is used to generate realistic voice narration for the video.  
    Perfect for content creators looking to streamline their workflow and focus on creativity! πŸš€  
    """)

    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")

    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()
    
    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
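        """Resolves the text source (uploaded file, typed text, or LLM-generated script) and builds the video."""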
        if file is not None and hasattr(file, "name"):  # gr.File returns an object exposing the temp file path
            with open(file.name, "r") as f:  # close the handle once the text is read
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)
    
    process_btn.click(handle_request, inputs=[text_input, topic_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input], outputs=output_video)

demo.launch(share=True)