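"""Gradio Space app (app.py) for HunyuanVideo text-to-video generation."""
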
import os
import random
import time
from datetime import datetime
from pathlib import Path
from typing import List

import gradio as gr
import PIL.Image
import spaces
import torch
from loguru import logger
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video
from huggingface_hub import snapshot_download

# Retained from the original Tencent inference path; only the commented-out
# preprocessing block below still references some of these.
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.utils.preprocess_text_encoder_tokenizer_utils import preprocess_text_encoder_tokenizer
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from hyvideo.constants import NEGATIVE_PROMPT

# if torch.cuda.device_count() > 0:
#     snapshot_download(repo_id="tencent/HunyuanVideo", repo_type="model", local_dir="ckpts", force_download=False)
#     snapshot_download(repo_id="xtuner/llava-llama-3-8b-v1_1-transformers", repo_type="model", local_dir="ckpts/llava-llama-3-8b-v1_1-transformers", force_download=True)
#     class Args:
#         def __init__(self, input_dir, output_dir):
#             self.input_dir = input_dir
#             self.output_dir = output_dir
#     # Create the object
#     args = Args("ckpts/llava-llama-3-8b-v1_1-transformers", "ckpts/text_encoder")
#     preprocess_text_encoder_tokenizer(args)
#     snapshot_download(repo_id="openai/clip-vit-large-patch14", repo_type="model", local_dir="ckpts/text_encoder_2", force_download=True)
def initialize_model():
    model_id = "hunyuanvideo-community/HunyuanVideo"
    transformer = HunyuanVideoTransformer3DModel.from_pretrained(
        model_id, subfolder="transformer", torch_dtype=torch.bfloat16
    )
    model = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
    # Tiled VAE decoding keeps peak memory manageable when decoding video latents.
    model.vae.enable_tiling()
    if torch.cuda.device_count() > 0:
        # Only move to GPU when one is available, so the Space can still launch
        # (and show its CPU warning) without one.
        model.to("cuda")
    return model


model = initialize_model()
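

# The pipeline is built once at import time and reused across requests: generate_video()
# runs on the CPU side and simply forwards to generate_video_gpu(), which carries the
# ZeroGPU decorator and therefore runs with a GPU attached.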
def generate_video(
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale,
):
    print(f"generate_video (prompt: {prompt})")
    return generate_video_gpu(
        model,
        prompt,
        resolution,
        video_length,
        seed,
        num_inference_steps,
        guidance_scale,
        flow_shift,
        embedded_guidance_scale,
    )
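

# spaces.GPU requests a ZeroGPU slot only while the decorated function runs;
# duration=120 budgets up to two minutes of GPU time per call.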
@spaces.GPU(duration=120)
def generate_video_gpu(
    model,
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale,
):
    print(f"generate_video_gpu (prompt: {prompt})")
    if torch.cuda.device_count() == 0:
        gr.Warning("Set this space to GPU config to make it work.")
        return None

    seed = None if seed == -1 else seed
    width, height = resolution.split("x")
    width, height = int(width), int(height)
    negative_prompt = ""  # not applicable in the inference
    # Note: flow_shift and embedded_guidance_scale are accepted for UI compatibility
    # but are not forwarded to the diffusers pipeline call below.

    print("Predicting video...")
    generator = None
    if seed is not None:
        generator = torch.manual_seed(seed)
    frames: List[PIL.Image.Image] = model(
        prompt=prompt,
        height=height,
        width=width,
        num_frames=video_length,
        generator=generator,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).frames[0]
    # export_to_video writes the frames to an .mp4 file and returns its path,
    # which is what the gr.Video output component expects.
    video_path = export_to_video(frames, fps=15)
    return video_path
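

# Rough sketch of the same call without the Gradio UI, using the demo defaults
# (832x624, 65 frames, 5 steps, guidance 1.0); the output filename is illustrative.
#
#     frames = model(
#         prompt="A cat walks on the grass, realistic style.",
#         height=624,
#         width=832,
#         num_frames=65,
#         num_inference_steps=5,
#         guidance_scale=1.0,
#     ).frames[0]
#     export_to_video(frames, "sample.mp4", fps=15)
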
def create_demo(model_path):
    with gr.Blocks() as demo:
        if torch.cuda.device_count() == 0:
            with gr.Row():
                gr.HTML("""
                <p style="background-color: red;"><big><big><big><b>⚠️ To use <i>Hunyuan Video</i>, <a href="https://huggingface.co./spaces/Fabrice-TIERCELIN/HunyuanVideo?duplicate=true">duplicate this space</a> and set a GPU with 80 GB VRAM.</b>
                You can't use <i>Hunyuan Video</i> directly here because this space runs on a CPU, which is not enough for <i>Hunyuan Video</i>. Please provide <a href="https://huggingface.co./spaces/Fabrice-TIERCELIN/HunyuanVideo/discussions/new">feedback</a> if you have issues.
                </big></big></big></p>
                """)

        gr.Markdown("# Hunyuan Video Generation")

        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt", value="A cat walks on the grass, realistic style.")
                with gr.Row():
                    resolution = gr.Dropdown(
                        choices=[
                            # 720p
                            ("1280x720 (16:9, 720p)", "1280x720"),
                            ("720x1280 (9:16, 720p)", "720x1280"),
                            ("1104x832 (4:3, 720p)", "1104x832"),
                            ("832x1104 (3:4, 720p)", "832x1104"),
                            ("960x960 (1:1, 720p)", "960x960"),
                            # 540p
                            ("960x544 (16:9, 540p)", "960x544"),
                            ("544x960 (9:16, 540p)", "544x960"),
                            ("832x624 (4:3, 540p)", "832x624"),
                            ("624x832 (3:4, 540p)", "624x832"),
                            ("720x720 (1:1, 540p)", "720x720"),
                        ],
                        value="832x624",
                        label="Resolution",
                    )
                    video_length = gr.Dropdown(
                        label="Video Length",
                        choices=[
                            ("2s (65f)", 65),
                            ("5s (129f)", 129),
                        ],
                        value=65,
                    )
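                    # Both frame counts follow the 4n + 1 pattern (65, 129) that
                    # HunyuanVideo's temporally compressed video latents expect.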
                num_inference_steps = gr.Slider(1, 100, value=5, step=1, label="Number of Inference Steps")
                with gr.Accordion("Advanced Options", open=False):
                    with gr.Column():
                        seed = gr.Slider(label="Seed (-1 for random)", value=-1, minimum=-1, maximum=2**63 - 1, step=1)
                        guidance_scale = gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Guidance Scale")
                        flow_shift = gr.Slider(0.0, 10.0, value=7.0, step=0.1, label="Flow Shift")
                        embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale")
                generate_btn = gr.Button(value="🚀 Generate Video", variant="primary")

            with gr.Row():
                output = gr.Video(label="Generated Video", autoplay=True)
                gr.Markdown("""
                ## **Alternatives**
                If you can't use _Hunyuan Video_, you can use _[CogVideoX](https://huggingface.co./spaces/THUDM/CogVideoX-5B-Space)_ or _[LTX Video Playground](https://huggingface.co./spaces/Lightricks/LTX-Video-Playground)_ instead.
                """)

        generate_btn.click(
            fn=generate_video,
            inputs=[
                prompt,
                resolution,
                video_length,
                seed,
                num_inference_steps,
                guidance_scale,
                flow_shift,
                embedded_guidance_scale,
            ],
            outputs=output,
        )

    return demo


if __name__ == "__main__":
    os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
    demo = create_demo("ckpts")
    demo.queue(10).launch()