import gc
import os
import numpy as np
import torch
from diffusers.training_utils import set_seed
from extern.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from extern.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter


class DepthCrafterDemo:
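    """Thin wrapper around the DepthCrafter pipeline: loads the UNet and the remaining
    pipeline components from the given checkpoints, then runs video depth inference."""
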
def __init__(
self,
unet_path: str,
pre_train_path: str,
cpu_offload: str = "model",
device: str = "cuda:0"
):
unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
unet_path,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
)
# load weights of other components from the provided checkpoint
self.pipe = DepthCrafterPipeline.from_pretrained(
pre_train_path,
unet=unet,
torch_dtype=torch.float16,
variant="fp16",
)
        # to save memory, the model can be offloaded to the CPU, or run sequentially to save even more memory
if cpu_offload is not None:
if cpu_offload == "sequential":
                # this is slower, but saves more memory
self.pipe.enable_sequential_cpu_offload()
elif cpu_offload == "model":
self.pipe.enable_model_cpu_offload()
else:
raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
else:
self.pipe.to(device)
# enable attention slicing and xformers memory efficient attention
try:
self.pipe.enable_xformers_memory_efficient_attention()
except Exception as e:
print(e)
print("Xformers is not enabled")
self.pipe.enable_attention_slicing()

    def infer(
self,
frames,
near,
far,
num_denoising_steps: int,
guidance_scale: float,
window_size: int = 110,
overlap: int = 25,
seed: int = 42,
track_time: bool = True,
):
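        # frames: video array of shape (T, H, W, 3); returns a (T, 1, H, W) depth tensor clipped to [near, far]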
set_seed(seed)
        # infer the depth map using the DepthCrafter pipeline
with torch.inference_mode():
res = self.pipe(
frames,
height=frames.shape[1],
width=frames.shape[2],
output_type="np",
guidance_scale=guidance_scale,
num_inference_steps=num_denoising_steps,
window_size=window_size,
overlap=overlap,
track_time=track_time,
).frames[0]
# convert the three-channel output to a single channel depth map
res = res.sum(-1) / res.shape[-1]
# normalize the depth map to [0, 1] across the whole video
depths = (res - res.min()) / (res.max() - res.min())
        # optional visualization of the depth sequence (vis_sequence_depth is not imported here)
        # vis = vis_sequence_depth(res)
        depths = torch.from_numpy(depths).unsqueeze(1)  # (T, H, W) -> (T, 1, H, W), e.g. 49 576 1024
depths *= 3900 # compatible with da output
        depths[depths < 1e-5] = 1e-5  # avoid division by zero
        depths = 10000. / depths  # invert to obtain depth-like values
        depths = depths.clip(near, far)  # clamp to the requested near/far range
        return depths
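

if __name__ == "__main__":
    # Example usage (illustrative sketch): the checkpoint paths, the random frames, and the
    # parameter values below are placeholders and assumptions; adapt them to your setup.
    demo = DepthCrafterDemo(
        unet_path="tencent/DepthCrafter",  # assumed UNet checkpoint location
        pre_train_path="stabilityai/stable-video-diffusion-img2vid-xt",  # assumed base pipeline
        cpu_offload="model",
    )
    # dummy clip: (T, H, W, 3) floats, assumed to be in [0, 1]
    frames = np.random.rand(49, 576, 1024, 3).astype(np.float32)
    depths = demo.infer(
        frames,
        near=0.0001,
        far=10000.0,
        num_denoising_steps=5,
        guidance_scale=1.0,
    )
    print(depths.shape)  # (49, 1, 576, 1024)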