import gc
from typing import Optional
import os
import numpy as np
import torch
from diffusers.training_utils import set_seed
from extern.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from extern.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter


class DepthCrafterDemo:
def __init__(
self,
unet_path: str,
pre_train_path: str,
        cpu_offload: Optional[str] = "model",
        device: str = "cuda:0",
):
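        # load the DepthCrafter spatio-temporal UNet in fp16 with low CPU memory usage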
unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
unet_path,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
)
# load weights of other components from the provided checkpoint
self.pipe = DepthCrafterPipeline.from_pretrained(
pre_train_path,
unet=unet,
torch_dtype=torch.float16,
variant="fp16",
)
        # to save memory, offload the model to CPU; sequential offload saves even more at the cost of speed
if cpu_offload is not None:
if cpu_offload == "sequential":
                # sequential offload is slower, but saves more memory
self.pipe.enable_sequential_cpu_offload()
elif cpu_offload == "model":
self.pipe.enable_model_cpu_offload()
else:
raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
else:
self.pipe.to(device)
# enable attention slicing and xformers memory efficient attention
try:
self.pipe.enable_xformers_memory_efficient_attention()
except Exception as e:
print(e)
print("Xformers is not enabled")
self.pipe.enable_attention_slicing()

    def infer(
self,
frames,
near,
far,
num_denoising_steps: int,
guidance_scale: float,
window_size: int = 110,
overlap: int = 25,
seed: int = 42,
track_time: bool = True,
):
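        """Run DepthCrafter on a video clip and return per-frame depth maps.

        frames is expected as a (T, H, W, 3) array with values in [0, 1];
        near and far clip the returned depth range after the
        disparity-to-depth conversion below.
        """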
set_seed(seed)
        # run the DepthCrafter pipeline to estimate depth for the whole clip
with torch.inference_mode():
res = self.pipe(
frames,
height=frames.shape[1],
width=frames.shape[2],
output_type="np",
guidance_scale=guidance_scale,
num_inference_steps=num_denoising_steps,
window_size=window_size,
overlap=overlap,
track_time=track_time,
).frames[0]
        # average the three channels into a single-channel depth map
        res = res.mean(-1)
        # normalize the depth map to [0, 1] across the whole video
        depths = (res - res.min()) / (res.max() - res.min())
        # reshape for downstream use: (T, H, W) -> (T, 1, H, W), e.g. 49x576x1024 -> 49x1x576x1024
        depths = torch.from_numpy(depths).unsqueeze(1)
        # rescale so the inverted values stay compatible with the da output range
        depths *= 3900
        # clamp to avoid division by zero when inverting
        depths[depths < 1e-5] = 1e-5
        # convert the normalized disparity to depth and clip it to the [near, far] range
        depths = 10000.0 / depths
        depths = depths.clip(near, far)
        return depths
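

# A minimal usage sketch, not part of the original file: the checkpoint ids,
# the random stand-in frames, and the near/far bounds below are illustrative
# assumptions rather than values prescribed by this repo.
if __name__ == "__main__":
    demo = DepthCrafterDemo(
        unet_path="tencent/DepthCrafter",  # assumed UNet checkpoint id
        pre_train_path="stabilityai/stable-video-diffusion-img2vid-xt",  # assumed base checkpoint
    )
    # stand-in clip: 49 frames of 576x1024 RGB noise in [0, 1]
    frames = np.random.rand(49, 576, 1024, 3).astype(np.float32)
    depths = demo.infer(
        frames,
        near=0.0001,
        far=10000.0,
        num_denoising_steps=5,
        guidance_scale=1.0,
    )
    print(depths.shape)  # torch.Size([49, 1, 576, 1024])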