import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces
# Install a prebuilt torchmcubes wheel bundled with the Space at startup
# (added while trying to solve ZeroGPU runtime issues).
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'))
@spaces.GPU
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script that takes about 8-10 minutes to read aloud."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    # return_full_text=False drops the prompt echo so only the generated script is returned
    response = llm(prompt, max_length=1500, do_sample=True, temperature=0.7, return_full_text=False)
    return response[0]['generated_text']
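# Rough usage sketch (not executed here): generating a script for a topic typed into the UI.
#   script = generate_script("The Future of AI")
#   print(script[:300])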
def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

    def load_font(size):
        """Falls back to PIL's built-in font if DejaVu is not installed."""
        try:
            return ImageFont.truetype(font_path, size)
        except IOError:
            return ImageFont.load_default()

    def measure(font):
        """Returns (width, height) of the text rendered with the given font."""
        bbox = draw.textbbox((0, 0), text, font=font)
        return bbox[2] - bbox[0], bbox[3] - bbox[1]

    # Start at the maximum font size and shrink until the text fits inside the padded area
    font_size = max_font_size
    font = load_font(font_size)
    text_w, text_h = measure(font)
    while font_size > min_font_size:
        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit loop
        font_size -= 2  # Decrease font size and re-measure
        font = load_font(font_size)
        text_w, text_h = measure(font)
    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")
    return title_img
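# Rough usage sketch (not executed here): render and inspect a title card on its own.
#   create_centered_title((640, 480), "The Future of AI").save("title_preview.png")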
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Splits text into chunks that each take roughly min_sec to max_sec seconds to read aloud."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
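# Worked example of the defaults above: at 2.5 words/sec each word contributes 0.4 s, so a chunk
# becomes eligible to close after ~13 words (>= 5 s) and actually closes once it reaches ~18 words
# (>= 7 s); the len(current_chunk) > 20 guard only takes over when words_per_second is above ~3.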
@spaces.GPU
def generate_speech(text):
    """Synthesizes narration for the full text with Coqui TTS and returns the WAV file path."""
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
@spaces.GPU
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    """Generates one image per chunk: Stable Diffusion renders if enabled, plain text cards otherwise."""
    image_paths = []
    pipe = None
    if use_diffusion:
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
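# Optional VRAM saving (a sketch, not wired in above): on CUDA the pipeline can be loaded in half
# precision, e.g.
#   pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
# which roughly halves memory use; keep the float32 default when falling back to CPU.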
def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
    """Assembles the title card, chunk images, and a closing black frame into output.mp4 with narration."""
    clips = []
    # Title clip using PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)
    title_img_path = "title.png"
    title_img.save(title_img_path)
    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        clips.append(clip)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
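# Note: the set_duration/set_audio/concatenate_videoclips calls above follow the MoviePy 1.x API
# exposed via moviepy.editor; MoviePy 2.x renamed these methods to with_duration/with_audio and
# dropped the editor module, so a moviepy<2 pin is assumed here.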
def process_text(text, movie_title, image_size, use_diffusion, num_steps):
    """Runs the full pipeline: chunking, speech synthesis, image generation, and video assembly."""
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path
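# Rough end-to-end sketch (not executed here; diffusion disabled so it runs quickly without a GPU):
#   process_text("A two-sentence demo script about the ocean.", "Ocean Demo",
#                (640, 480), use_diffusion=False, num_steps=1)
# writes speech.wav, image_0.png, ..., title.png and output.mp4 into the working directory.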
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI 🎥")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.
    The system then brings your script to life by creating relevant images using Stable Diffusion and compiling them into a video.
    To make your content even more engaging, AI-powered text-to-speech (TTS) is used to generate realistic voice narration for the video.
    Perfect for content creators looking to streamline their workflow and focus on creativity! 🚀
    """)
    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
        if file is not None and hasattr(file, "name"):  # Check if 'file' is a file object
            with open(file.name, "r") as f:
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)

    process_btn.click(handle_request,
                      inputs=[text_input, topic_input, movie_title_input, file_input,
                              image_size_input, use_diffusion_input, num_steps_input],
                      outputs=output_video)

demo.launch(share=True)