Spaces:

Hyathi
/

SoundImage-LipSync

Sleeping

App Files Community

samarth-ht committed on Jan 30

Commit

21c3587

1 Parent(s): 229302a

mask dropdown added

Browse files

Files changed (5) hide show

app.py +24 -3
scripts/inference.py +1 -0
soundimage/pipelines/lipsync_pipeline.py +2 -1
soundimage/utils/image_processor.py +4 -4
soundimage/utils/mask.png +0 -3

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ def process_video(
     inference_steps,
     seed,
     checkpoint_file,
 ):
     # Create the temp directory if it doesn't exist
     output_dir = Path("./temp")
@@ -26,6 +27,9 @@ def process_video(
     # Use selected checkpoint or fall back to default
     checkpoint_path = Path("checkpoints/unetFiles") / checkpoint_file if checkpoint_file else CHECKPOINT_PATH
     # Convert paths to absolute Path objects and normalize them
     video_file_path = Path(video_path)
@@ -48,7 +52,7 @@ def process_video(
     )
     # Parse the arguments
-    args = create_args(video_path, audio_path, output_path, guidance_scale, seed, checkpoint_path)
     try:
         result = main(
@@ -63,7 +67,8 @@ def process_video(
 def create_args(
-    video_path: str, audio_path: str, output_path: str, guidance_scale: float, seed: int, checkpoint_path: Path
 ) -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument("--inference_ckpt_path", type=str, required=True)
@@ -72,6 +77,7 @@ def create_args(
     parser.add_argument("--video_out_path", type=str, required=True)
     parser.add_argument("--guidance_scale", type=float, default=1.0)
     parser.add_argument("--seed", type=int, default=1247)
     return parser.parse_args(
         [
@@ -87,6 +93,8 @@ def create_args(
             str(guidance_scale),
             "--seed",
             str(seed),
         ]
     )
@@ -97,6 +105,13 @@ def get_checkpoint_files():
         return []
     return [f.name for f in unet_files_dir.glob("*.pt")]
 # Create Gradio interface
 with gr.Blocks(title="SoundImage") as demo:
     gr.Markdown(
@@ -109,12 +124,17 @@ with gr.Blocks(title="SoundImage") as demo:
     with gr.Row():
         with gr.Column():
-            # Add checkpoint selector dropdown
             checkpoint_dropdown = gr.Dropdown(
                 choices=get_checkpoint_files(),
                 label="Select Checkpoint",
                 value=get_checkpoint_files()[0] if get_checkpoint_files() else None
             )
             video_input = gr.Video(label="Input Video")
             audio_input = gr.Audio(label="Input Audio", type="filepath")
@@ -156,6 +176,7 @@ with gr.Blocks(title="SoundImage") as demo:
             inference_steps,
             seed,
             checkpoint_dropdown,
         ],
         outputs=video_output,
     )

     inference_steps,
     seed,
     checkpoint_file,
+    mask_file,
 ):
     # Create the temp directory if it doesn't exist
     output_dir = Path("./temp")
     # Use selected checkpoint or fall back to default
     checkpoint_path = Path("checkpoints/unetFiles") / checkpoint_file if checkpoint_file else CHECKPOINT_PATH
+    # Get mask path
+    mask_path = Path("masks") / mask_file if mask_file else None
     # Convert paths to absolute Path objects and normalize them
     video_file_path = Path(video_path)
     )
     # Parse the arguments
+    args = create_args(video_path, audio_path, output_path, guidance_scale, seed, checkpoint_path, mask_path)
     try:
         result = main(
 def create_args(
+    video_path: str, audio_path: str, output_path: str, guidance_scale: float, seed: int,
+    checkpoint_path: Path, mask_path: Path
 ) -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument("--inference_ckpt_path", type=str, required=True)
     parser.add_argument("--video_out_path", type=str, required=True)
     parser.add_argument("--guidance_scale", type=float, default=1.0)
     parser.add_argument("--seed", type=int, default=1247)
+    parser.add_argument("--mask_path", type=str, required=False)
     return parser.parse_args(
         [
             str(guidance_scale),
             "--seed",
             str(seed),
+            "--mask_path",
+            mask_path.absolute().as_posix() if mask_path else "",
         ]
     )
         return []
     return [f.name for f in unet_files_dir.glob("*.pt")]
+# Add this function to get mask files
+def get_mask_files():
+    masks_dir = Path("masks")
+    if not masks_dir.exists():
+        return []
+    return [f.name for f in masks_dir.glob("*.png")]  # Assuming masks are PNG files
 # Create Gradio interface
 with gr.Blocks(title="SoundImage") as demo:
     gr.Markdown(
     with gr.Row():
         with gr.Column():
+            # Add checkpoint and mask selectors
             checkpoint_dropdown = gr.Dropdown(
                 choices=get_checkpoint_files(),
                 label="Select Checkpoint",
                 value=get_checkpoint_files()[0] if get_checkpoint_files() else None
             )
+            mask_dropdown = gr.Dropdown(  # New dropdown for masks
+                choices=get_mask_files(),
+                label="Select Mask",
+                value=get_mask_files()[0] if get_mask_files() else None
+            )
             video_input = gr.Video(label="Input Video")
             audio_input = gr.Audio(label="Input Audio", type="filepath")
             inference_steps,
             seed,
             checkpoint_dropdown,
+            mask_dropdown,  # Add mask_dropdown to inputs
         ],
         outputs=video_output,
     )

scripts/inference.py CHANGED Viewed

@@ -84,6 +84,7 @@ def main(config, args):
         weight_dtype=dtype,
         width=config.data.resolution,
         height=config.data.resolution,
     )

         weight_dtype=dtype,
         width=config.data.resolution,
         height=config.data.resolution,
+        mask_path=args.mask_path,
     )

soundimage/pipelines/lipsync_pipeline.py CHANGED Viewed

@@ -296,6 +296,7 @@ class LipsyncPipeline(DiffusionPipeline):
         audio_path: str,
         video_out_path: str,
         video_mask_path: str = None,
         num_frames: int = 16,
         video_fps: int = 25,
         audio_sample_rate: int = 16000,
@@ -317,7 +318,7 @@ class LipsyncPipeline(DiffusionPipeline):
         # 0. Define call parameters
         batch_size = 1
         device = self._execution_device
-        self.image_processor = ImageProcessor(height, mask=mask, device="cuda")
         self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
         video_frames, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)

         audio_path: str,
         video_out_path: str,
         video_mask_path: str = None,
+        mask_path: str = None,
         num_frames: int = 16,
         video_fps: int = 25,
         audio_sample_rate: int = 16000,
         # 0. Define call parameters
         batch_size = 1
         device = self._execution_device
+        self.image_processor = ImageProcessor(height, mask=mask, device="cuda", mask_image=mask_path)
         self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
         video_frames, original_video_frames, boxes, affine_matrices = self.affine_transform_video(video_path)

soundimage/utils/image_processor.py CHANGED Viewed

@@ -28,8 +28,8 @@ https://stackoverflow.com/questions/23853632/which-kind-of-interpolation-best-fo
 """
-def load_fixed_mask(resolution: int) -> torch.Tensor:
-    mask_image = cv2.imread("soundimage/utils/mask.png")
     mask_image = cv2.cvtColor(mask_image, cv2.COLOR_BGR2RGB)
     mask_image = cv2.resize(mask_image, (resolution, resolution), interpolation=cv2.INTER_AREA) / 255.0
     mask_image = rearrange(torch.from_numpy(mask_image), "h w c -> c h w")
@@ -37,7 +37,7 @@ def load_fixed_mask(resolution: int) -> torch.Tensor:
 class ImageProcessor:
-    def __init__(self, resolution: int = 512, mask: str = "fix_mask", device: str = "cpu", mask_image=None):
         self.resolution = resolution
         self.resize = transforms.Resize(
             (resolution, resolution), interpolation=transforms.InterpolationMode.BILINEAR, antialias=True
@@ -53,7 +53,7 @@ class ImageProcessor:
             self.restorer = AlignRestore()
             if mask_image is None:
-                self.mask_image = load_fixed_mask(resolution)
             else:
                 self.mask_image = mask_image

 """
+def load_fixed_mask(resolution: int, mask_path: str) -> torch.Tensor:
+    mask_image = cv2.imread(mask_path)
     mask_image = cv2.cvtColor(mask_image, cv2.COLOR_BGR2RGB)
     mask_image = cv2.resize(mask_image, (resolution, resolution), interpolation=cv2.INTER_AREA) / 255.0
     mask_image = rearrange(torch.from_numpy(mask_image), "h w c -> c h w")
 class ImageProcessor:
+    def __init__(self, resolution: int = 512, mask: str = "fix_mask", device: str = "cpu", mask_image=None, mask_path=None):
         self.resolution = resolution
         self.resize = transforms.Resize(
             (resolution, resolution), interpolation=transforms.InterpolationMode.BILINEAR, antialias=True
             self.restorer = AlignRestore()
             if mask_image is None:
+                self.mask_image = load_fixed_mask(resolution, mask_path)
             else:
                 self.mask_image = mask_image

soundimage/utils/mask.png DELETED Viewed

Git LFS Details

SHA256: aa233251b9ff5691a1565a4108f0910ab1e5e7ad79a7bb2b741ab4d92c81053c
Pointer size: 129 Bytes
Size of remote file: 1.87 kB