import argparse
import os
from datetime import datetime
from pathlib import Path

import gradio as gr
import librosa
import spaces

from infer import load_models, main

# Warm up CUDA at import time; failure here is non-fatal (e.g. CPU-only hosts).
try:
    import torch

    if torch.cuda.is_available():
        _ = torch.tensor([0.0]).to("cuda")
except Exception as e:
    print(f"GPU warmup failed: {e}")

# Keep Gradio's temporary uploads inside the repo tree.
os.environ["GRADIO_TEMP_DIR"] = "./tmp"

# Models are loaded lazily on the first request and cached in module globals.
pipe, fantasytalking, wav2vec_processor, wav2vec = None, None, None, None


@spaces.GPU(duration=1200)
def generate_video(
    image_path,
    audio_path,
    prompt,
    prompt_cfg_scale,
    audio_cfg_scale,
    audio_weight,
    image_size,
    max_num_frames,
    inference_steps,
    seed,
):
    # Create the output directory if it doesn't exist
    output_dir = Path("./output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Convert paths to absolute, normalized POSIX paths
    image_path = Path(image_path).absolute().as_posix()
    audio_path = Path(audio_path).absolute().as_posix()

    # Build the argument namespace shared with the CLI entry point
    args = create_args(
        image_path=image_path,
        audio_path=audio_path,
        prompt=prompt,
        output_dir=str(output_dir),
        audio_weight=audio_weight,
        prompt_cfg_scale=prompt_cfg_scale,
        audio_cfg_scale=audio_cfg_scale,
        image_size=image_size,
        max_num_frames=max_num_frames,
        inference_steps=inference_steps,
        seed=seed,
    )

    try:
        global pipe, fantasytalking, wav2vec_processor, wav2vec
        if pipe is None:
            pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)
        output_path = main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
        return output_path  # Path of the generated video, displayed by Gradio
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        raise gr.Error(f"Error during processing: {str(e)}")


def create_args(
    image_path: str,
    audio_path: str,
    prompt: str,
    output_dir: str,
    audio_weight: float,
    prompt_cfg_scale: float,
    audio_cfg_scale: float,
    image_size: int,
    max_num_frames: int,
    inference_steps: int,
    seed: int,
) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wan_model_dir",
        type=str,
        default="./models/Wan2.1-I2V-14B-720P",
        required=False,
        help="The dir of the Wan I2V 14B model.",
    )
    parser.add_argument(
        "--fantasytalking_model_path",
        type=str,
        default="./models/fantasytalking_model.ckpt",
        required=False,
        help="The .ckpt path of the FantasyTalking model.",
    )
    parser.add_argument(
        "--wav2vec_model_dir",
        type=str,
        default="./models/wav2vec2-base-960h",
        required=False,
        help="The dir of the wav2vec model.",
    )
    parser.add_argument(
        "--image_path",
        type=str,
        default="./assets/images/woman.png",
        required=False,
        help="The path of the image.",
    )
    parser.add_argument(
        "--audio_path",
        type=str,
        default="./assets/audios/woman.wav",
        required=False,
        help="The path of the audio.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="A woman is talking.",
        required=False,
        help="Text prompt describing the video.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./output",
        help="Dir to save the video.",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=512,
        help="The image will be resized proportionally to this size.",
    )
    parser.add_argument(
        "--audio_scale",
        type=float,
        default=1.0,
        help="Strength of the audio (lip-sync) condition.",
    )
    parser.add_argument(
        "--prompt_cfg_scale",
        type=float,
        default=5.0,
        required=False,
        help="Prompt CFG scale.",
    )
    parser.add_argument(
        "--audio_cfg_scale",
        type=float,
        default=5.0,
        required=False,
        help="Audio CFG scale.",
    )
    parser.add_argument(
        "--max_num_frames",
        type=int,
        default=81,
        required=False,
        help="Maximum number of frames to generate; audio beyond max_num_frames/fps seconds is truncated.",
    )
    parser.add_argument(
        "--inference_steps",
        type=int,
        default=20,
        required=False,
    )
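    # NOTE: --fps and --num_persistent_param_in_dit below are not exposed in the
    # Gradio UI and always keep their defaults; of the remaining runtime options,
    # only --seed is forwarded from the UI inputs via parse_args() further down.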
parser.add_argument( "--fps", type=int, default=23, required=False, ) parser.add_argument( "--num_persistent_param_in_dit", type=int, default=None, required=False, help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required" ) parser.add_argument( "--seed", type=int, default=1111, required=False, ) args = parser.parse_args( [ "--image_path", image_path, "--audio_path", audio_path, "--prompt", prompt, "--output_dir", output_dir, "--image_size", str(image_size), "--audio_scale", str(audio_weight), "--prompt_cfg_scale", str(prompt_cfg_scale), "--audio_cfg_scale", str(audio_cfg_scale), "--max_num_frames", str(max_num_frames), "--inference_steps", str(inference_steps), "--seed", str(seed), ] ) print(args) return args # Create Gradio interface with gr.Blocks(title="FantasyTalking Video Generation") as demo: gr.Markdown( """ # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis