import torch
import gradio as gr
import imageio
import numpy as np
from PIL import Image
from torchvision.transforms import ToTensor, Resize
import spaces
import tempfile
from scipy.ndimage import gaussian_filter
from aura_sr import AuraSR
import cv2
import torch.nn.functional as F

# Load AuraSR-v2 model once at startup
aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2")

# Post-processing functions
def apply_lens_distortion(image, k1=0.2):
    """Apply a lens-distortion warp using OpenCV.

    Note: cv2.undistort applies the inverse of the supplied distortion model,
    so a positive barrel coefficient k1 yields a pincushion-style warp here;
    either direction reads as a lens effect.
    """
    h, w = image.shape[:2]
    # Simple pinhole intrinsics: focal length ~ image width, principal point at the center
    camera_matrix = np.array([[w, 0, w / 2], [0, w, h / 2], [0, 0, 1]], dtype=np.float32)
    dist_coeffs = np.array([k1, 0, 0, 0], dtype=np.float32)
    return cv2.undistort(image, camera_matrix, dist_coeffs)

def apply_depth_of_field(image_tensor, depth_tensor, focus_depth=0.5, blur_size=5):
    """Apply a simple depth-of-field effect: box-blur the frame, then keep
    pixels whose depth is close to focus_depth sharp."""
    # Resize the depth map if it no longer matches the frame (e.g. after SSAA downscaling)
    if depth_tensor.shape[-2:] != image_tensor.shape[-2:]:
        depth_tensor = F.interpolate(depth_tensor.reshape(1, 1, *depth_tensor.shape[-2:]),
                                     size=image_tensor.shape[-2:], mode='bilinear',
                                     align_corners=False)[0, 0]
    depth_diff = torch.abs(depth_tensor - focus_depth)
    pad = blur_size // 2
    padded = F.pad(image_tensor.unsqueeze(0), (pad, pad, pad, pad), mode='reflect')
    # Depthwise box filter: with groups=3 the kernel must be shaped (3, 1, k, k)
    kernel = torch.ones(3, 1, blur_size, blur_size, device=image_tensor.device) / (blur_size ** 2)
    blurred = F.conv2d(padded, kernel, groups=3).squeeze(0)
    # In-focus mask: pixels within 0.1 of the focus depth stay sharp
    mask = (depth_diff < 0.1).float()
    return image_tensor * mask + blurred * (1 - mask)

def apply_vignette(image):
    """Apply vignette effect."""
    h, w = image.shape[:2]
    x, y = np.meshgrid(np.arange(w), np.arange(h))
    center_x, center_y = w / 2, h / 2
    radius = np.sqrt((x - center_x) ** 2 + (y - center_y) ** 2)
    max_radius = np.sqrt(center_x ** 2 + center_y ** 2)
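    # Quadratic falloff: weight 1.0 at the image center, 0.0 at the corners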
    vignette = 1 - (radius / max_radius) ** 2
    vignette = np.clip(vignette, 0, 1)
    return (image * vignette[..., np.newaxis]).astype(np.uint8)

def parse_keyframes(keyframe_text):
    """Parse keyframe text into time-position pairs."""
    keyframes = []
    try:
        for entry in keyframe_text.split():
            time, pos = entry.split(':')
            x, y = map(float, pos.split(','))
            keyframes.append((float(time), x, y))
        keyframes.sort()  # Sort by time
        return keyframes
    except (ValueError, IndexError):
        return [(0, 0, 0), (1, 0, 0)]  # Fall back to a static two-keyframe path on malformed input

def interpolate_keyframes(t, keyframes):
    """Interpolate camera position between keyframes."""
    if t <= keyframes[0][0]:
        return keyframes[0][1], keyframes[0][2]
    if t >= keyframes[-1][0]:
        return keyframes[-1][1], keyframes[-1][2]
    for i in range(len(keyframes) - 1):
        t1, x1, y1 = keyframes[i]
        t2, x2, y2 = keyframes[i + 1]
        if t1 <= t <= t2:
            alpha = (t - t1) / (t2 - t1)
            return x1 + alpha * (x2 - x1), y1 + alpha * (y2 - y1)
    return 0, 0  # Fallback

@spaces.GPU
def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps, duration, ssaa_factor, use_taa, use_upscale, apply_lens, apply_dof, apply_vig, keyframe_text):
    """Generate a 3D parallax video with advanced features."""
    # Validate inputs
    if image is None or depth_map is None:
        raise gr.Error("Please upload both an image and a depth map")
    if image.size != depth_map.size:
        raise ValueError("Image and depth map must have the same dimensions")

    # Convert to tensors
    image_tensor = ToTensor()(image).to('cuda', dtype=torch.float32)
    depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda', dtype=torch.float32)
    depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)

    # Smooth the depth map to soften hard depth edges, which otherwise cause
    # tearing/stretching artifacts in the warp
    depth_np = depth_tensor.squeeze().cpu().numpy()
    depth_np = gaussian_filter(depth_np, sigma=1)
    depth_tensor = torch.tensor(depth_np, device='cuda', dtype=torch.float32).unsqueeze(0)

    # Apply SSAA
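    # (supersampling anti-aliasing: warp at ssaa_factor x the native resolution,
    # then downscale each finished frame, trading compute for smoother edges)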
    if ssaa_factor > 1:
        upscale = Resize((int(image.height * ssaa_factor), int(image.width * ssaa_factor)), antialias=True)
        image_tensor = upscale(image_tensor)
        depth_tensor = upscale(depth_tensor)

    H, W = image_tensor.shape[1], image_tensor.shape[2]
    x = torch.arange(0, W).float().to('cuda')
    y = torch.arange(0, H).float().to('cuda')
    xx, yy = torch.meshgrid(x, y, indexing='xy')
    pixel_grid = torch.stack((xx, yy), dim=-1)
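    # Parallax model: each pixel is displaced in proportion to its depth value,
    # so near (bright) regions shift more than far (dark) ones as the virtual
    # camera moves -- displacement = k * camera_offset * depth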

    # Parse keyframes for custom path
    keyframes = parse_keyframes(keyframe_text) if animation_style == "custom" else None

    # Generate frames
    num_frames = int(fps * duration)
    frames = []
    prev_frame = None

    for frame in range(num_frames):
        t = frame / num_frames
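        # t sweeps [0, 1), so the sin/cos-based paths loop seamlessly when the video repeats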
        if animation_style == "zoom":
            zoom_factor = 1 + amplitude * np.sin(2 * np.pi * t)
            displacement_x = (pixel_grid[:, :, 0] - W / 2) * (1 - zoom_factor) * depth_tensor.squeeze()
            displacement_y = (pixel_grid[:, :, 1] - H / 2) * (1 - zoom_factor) * depth_tensor.squeeze()
        elif animation_style == "horizontal":
            camera_x = amplitude * np.sin(2 * np.pi * t)
            displacement_x = k * camera_x * depth_tensor.squeeze()
            displacement_y = 0
        elif animation_style == "vertical":
            camera_y = amplitude * np.sin(2 * np.pi * t)
            displacement_x = 0
            displacement_y = k * camera_y * depth_tensor.squeeze()
        elif animation_style == "circle":
            camera_x = amplitude * np.sin(2 * np.pi * t)
            camera_y = amplitude * np.cos(2 * np.pi * t)
            displacement_x = k * camera_x * depth_tensor.squeeze()
            displacement_y = k * camera_y * depth_tensor.squeeze()
        elif animation_style == "spiral":
            radius = amplitude * (1 - t)
            camera_x = radius * np.sin(4 * np.pi * t)
            camera_y = radius * np.cos(4 * np.pi * t)
            displacement_x = k * camera_x * depth_tensor.squeeze()
            displacement_y = k * camera_y * depth_tensor.squeeze()
        elif animation_style == "custom":
            camera_x, camera_y = interpolate_keyframes(t, keyframes)
            displacement_x = k * camera_x * depth_tensor.squeeze()
            displacement_y = k * camera_y * depth_tensor.squeeze()
        else:
            raise ValueError(f"Unsupported animation style: {animation_style}")

        source_pixel_x = pixel_grid[:, :, 0] + displacement_x
        source_pixel_y = pixel_grid[:, :, 1] + displacement_y

        # Normalize to [-1, 1]
        grid_x = 2 * source_pixel_x / (W - 1) - 1
        grid_y = 2 * source_pixel_y / (H - 1) - 1
        grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0)
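        # grid_sample maps -1 to the first pixel and +1 to the last along each
        # axis (align_corners=True); coordinates pushed outside that range by
        # large displacements sample zeros, i.e. black borders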

        # Warp image
        warped = torch.nn.functional.grid_sample(image_tensor.unsqueeze(0), grid, mode='bicubic', align_corners=True)

        # Downsample if SSAA
        if ssaa_factor > 1:
            downscale = Resize((image.height, image.width), antialias=True)
            warped = downscale(warped.squeeze(0)).unsqueeze(0)

        # Apply depth of field if enabled
        if apply_dof:
            warped = apply_depth_of_field(warped.squeeze(0), depth_tensor.squeeze(0))

        # Convert to numpy; clip first because bicubic resampling can overshoot
        # [0, 1], which would wrap around in the uint8 cast
        frame_img = warped.squeeze(0).permute(1, 2, 0).cpu().numpy()
        frame_img = (np.clip(frame_img, 0, 1) * 255).astype(np.uint8)

        # Apply lens distortion if enabled
        if apply_lens:
            frame_img = apply_lens_distortion(frame_img)

        # Apply vignette if enabled
        if apply_vig:
            frame_img = apply_vignette(frame_img)

        # Apply upscaling if enabled
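        # (upscale_4x_overlapped runs the GAN upscaler on overlapping tiles and
        # blends them to avoid visible seams; this dominates per-frame cost)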
        if use_upscale:
            frame_pil = Image.fromarray(frame_img)
            frame_pil = aura_sr.upscale_4x_overlapped(frame_pil)
            frame_img = np.array(frame_pil)

        # Apply TAA if enabled: blend 20% of the previous output frame into the
        # current one (an exponential moving average) to damp temporal flicker
        if use_taa and prev_frame is not None:
            frame_img = (frame_img * 0.8 + prev_frame * 0.2).astype(np.uint8)

        frames.append(frame_img)
        prev_frame = frame_img.copy() if use_taa else None

    # Save video
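    # Note: imageio's ffmpeg backend may resize frames whose dimensions are not
    # divisible by its macro_block_size (16 by default) and warn about it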
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        output_path = tmpfile.name
        writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
        for frame in frames:
            writer.append_data(frame)
        writer.close()

    return output_path

# Gradio interface
with gr.Blocks(title="Ultimate 3D Parallax Video Generator") as demo:
    gr.Markdown("# Ultimate 3D Parallax Video Generator")
    gr.Markdown("Generate high-quality 3D parallax videos with advanced features, post-processing, and custom paths.")

    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        depth_input = gr.Image(type="pil", label="Upload Depth Map")

    with gr.Row():
        animation_style = gr.Dropdown(
            ["zoom", "horizontal", "vertical", "circle", "spiral", "custom"],
            label="Animation Style",
            value="horizontal"
        )
        amplitude_slider = gr.Slider(0, 10, value=2, label="Amplitude", step=0.1)
        k_slider = gr.Slider(1, 20, value=5, label="Depth Scale (k)", step=0.1)
        fps_slider = gr.Slider(10, 60, value=30, label="FPS", step=1)
        duration_slider = gr.Slider(1, 10, value=5, label="Duration (s)", step=0.1)

    with gr.Row():
        ssaa_factor = gr.Dropdown([1, 2, 4], label="SSAA Factor", value=1)
        use_taa = gr.Checkbox(label="Enable TAA", value=False)
        use_upscale = gr.Checkbox(label="Enable AuraSR-v2 Upscaling", value=False)
        apply_lens = gr.Checkbox(label="Apply Lens Distortion", value=False)
        apply_dof = gr.Checkbox(label="Apply Depth of Field", value=False)
        apply_vig = gr.Checkbox(label="Apply Vignette", value=False)

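    # The keyframe textbox is only consumed when Animation Style is "custom";
    # other styles ignore it.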
    with gr.Row():
        keyframe_text = gr.Textbox(
            label="Custom Keyframes (time:x,y)",
            value="0:0,0 0.5:5,0 1:0,0",
            placeholder="e.g., 0:0,0 0.5:5,0 1:0,0",
            visible=True
        )

    generate_btn = gr.Button("Generate Video")
    video_output = gr.Video(label="Parallax Video")

    generate_btn.click(
        fn=generate_parallax_video,
        inputs=[
            image_input, depth_input, animation_style, amplitude_slider, k_slider,
            fps_slider, duration_slider, ssaa_factor, use_taa, use_upscale,
            apply_lens, apply_dof, apply_vig, keyframe_text
        ],
        outputs=video_output
    )

demo.launch()