Spaces:
Running
on
Zero
Running
on
Zero
File size: 8,712 Bytes
178f950 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
import json
import os
from typing import *
import numpy as np
import torch
import utils3d.torch
from .components import StandardDatasetBase, TextConditionedMixin, ImageConditionedMixin
from ..modules.sparse.basic import SparseTensor
from .. import models
from ..utils.render_utils import get_renderer
from ..utils.dist_utils import read_file_dist
from ..utils.data_utils import load_balanced_group_indices
class SLatVisMixin:
    """Mixin that decodes structured latents (SLat) and renders preview images.

    Lazily loads a SLat decoder — either from a local experiment directory
    (``slat_dec_path`` + ``slat_dec_ckpt``) or from a pretrained hub name —
    decodes latents into 3D representations, and renders each representation
    from four viewpoints into a 2x2 image grid.

    NOTE(review): assumes the consuming class also defines
    ``self.normalization``, ``self.mean`` and ``self.std`` (see ``SLat``).

    Args:
        pretrained_slat_dec (str): hub name of the pretrained SLat decoder.
        slat_dec_path (str): path to a local decoder experiment; if given,
            overrides ``pretrained_slat_dec``.
        slat_dec_ckpt (str): checkpoint name used together with ``slat_dec_path``.
    """
    def __init__(
        self,
        *args,
        pretrained_slat_dec: str = 'JeffreyXiang/TRELLIS-image-large/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16',
        slat_dec_path: Optional[str] = None,
        slat_dec_ckpt: Optional[str] = None,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.slat_dec = None  # decoder is loaded lazily in _loading_slat_dec()
        self.pretrained_slat_dec = pretrained_slat_dec
        self.slat_dec_path = slat_dec_path
        self.slat_dec_ckpt = slat_dec_ckpt

    def _loading_slat_dec(self):
        """Load the SLat decoder onto the GPU on first use (no-op if loaded)."""
        if self.slat_dec is not None:
            return
        if self.slat_dec_path is not None:
            # Local experiment: instantiate the decoder from its config and
            # load the named checkpoint.
            with open(os.path.join(self.slat_dec_path, 'config.json'), 'r') as f:
                cfg = json.load(f)
            decoder = getattr(models, cfg['models']['decoder']['name'])(**cfg['models']['decoder']['args'])
            ckpt_path = os.path.join(self.slat_dec_path, 'ckpts', f'decoder_{self.slat_dec_ckpt}.pt')
            decoder.load_state_dict(torch.load(read_file_dist(ckpt_path), map_location='cpu', weights_only=True))
        else:
            decoder = models.from_pretrained(self.pretrained_slat_dec)
        self.slat_dec = decoder.cuda().eval()

    def _delete_slat_dec(self):
        """Drop the decoder reference so its GPU memory can be reclaimed."""
        del self.slat_dec
        self.slat_dec = None

    @torch.no_grad()
    def decode_latent(self, z, batch_size=4):
        """Decode latents ``z`` into renderable representations.

        De-normalizes with ``self.mean``/``self.std`` when normalization stats
        are configured, decodes in mini-batches of ``batch_size``, and unloads
        the decoder afterwards.

        Args:
            z: batched latent tensor (first dim is the batch).
            batch_size (int): decode chunk size to bound GPU memory.

        Returns:
            list: decoded representations, one per latent (the decoder is
            expected to return a list per mini-batch).
        """
        self._loading_slat_dec()
        reps = []
        if self.normalization is not None:
            z = z * self.std.to(z.device) + self.mean.to(z.device)
        for i in range(0, z.shape[0], batch_size):
            reps.append(self.slat_dec(z[i:i+batch_size]))
        reps = sum(reps, [])  # flatten list of per-batch lists
        self._delete_slat_dec()
        return reps

    @torch.no_grad()
    def visualize_sample(self, x_0: Union[SparseTensor, dict]):
        """Decode a batch of latents and render each as a 2x2 grid of views.

        Args:
            x_0: a ``SparseTensor`` of latents, or a batch dict holding one
                under the ``'x_0'`` key.

        Returns:
            torch.Tensor: rendered images of shape (B, 3, 1024, 1024), on GPU.
        """
        x_0 = x_0 if isinstance(x_0, SparseTensor) else x_0['x_0']
        reps = self.decode_latent(x_0.cuda())

        # Four cameras at 90-degree yaw intervals with one shared random yaw
        # offset and an independent random pitch per view, all looking at the
        # origin from distance 2.
        yaws = [0, np.pi / 2, np.pi, 3 * np.pi / 2]
        yaw_offset = np.random.uniform(-np.pi / 4, np.pi / 4)
        yaws = [y + yaw_offset for y in yaws]
        # Renamed from `pitch`: the original shadowed this list with the loop
        # scalar of the same name.
        pitches = [np.random.uniform(-np.pi / 4, np.pi / 4) for _ in range(4)]

        # Intrinsics are identical for every view; build them once outside
        # the loop (the original rebuilt the fov tensor per iteration).
        fov = torch.deg2rad(torch.tensor(40)).cuda()
        intrinsics = utils3d.torch.intrinsics_from_fov_xy(fov, fov)

        exts = []
        ints = []
        for yaw, pitch in zip(yaws, pitches):
            orig = torch.tensor([
                np.sin(yaw) * np.cos(pitch),
                np.cos(yaw) * np.cos(pitch),
                np.sin(pitch),
            ]).float().cuda() * 2
            extrinsics = utils3d.torch.extrinsics_look_at(orig, torch.tensor([0, 0, 0]).float().cuda(), torch.tensor([0, 0, 1]).float().cuda())
            exts.append(extrinsics)
            ints.append(intrinsics)

        renderer = get_renderer(reps[0])
        images = []
        tile = [2, 2]  # 2x2 grid of 512x512 tiles per 1024x1024 image
        for representation in reps:
            image = torch.zeros(3, 1024, 1024).cuda()
            for j, (ext, intr) in enumerate(zip(exts, ints)):
                res = renderer.render(representation, ext, intr)
                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
            images.append(image)
        images = torch.stack(images)
        return images
class SLat(SLatVisMixin, StandardDatasetBase):
    """
    Structured latent (SLat) dataset.

    Loads per-instance sparse latents (integer voxel coordinates plus float
    feature vectors) from ``<root>/latents/<latent_model>/<instance>.npz``,
    optionally normalizing the features with precomputed channel statistics.

    Args:
        roots (str): path to the dataset
        latent_model (str): name of the latent model
        min_aesthetic_score (float): minimum aesthetic score
        max_num_voxels (int): maximum number of voxels
        normalization (dict): normalization stats ('mean' and 'std' per feature channel)
        pretrained_slat_dec (str): name of the pretrained slat decoder
        slat_dec_path (str): path to the slat decoder, if given, will override the pretrained_slat_dec
        slat_dec_ckpt (str): name of the slat decoder checkpoint
    """
    def __init__(self,
        roots: str,
        *,
        latent_model: str,
        min_aesthetic_score: float = 5.0,
        max_num_voxels: int = 32768,
        normalization: Optional[dict] = None,
        pretrained_slat_dec: str = 'JeffreyXiang/TRELLIS-image-large/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16',
        slat_dec_path: Optional[str] = None,
        slat_dec_ckpt: Optional[str] = None,
    ):
        # Set filtering attributes before super().__init__ — filter_metadata()
        # reads latent_model / min_aesthetic_score / max_num_voxels, and is
        # presumably invoked by the base constructor (TODO confirm against
        # StandardDatasetBase).
        self.normalization = normalization
        self.latent_model = latent_model
        self.min_aesthetic_score = min_aesthetic_score
        self.max_num_voxels = max_num_voxels
        self.value_range = (0, 1)
        super().__init__(
            roots,
            pretrained_slat_dec=pretrained_slat_dec,
            slat_dec_path=slat_dec_path,
            slat_dec_ckpt=slat_dec_ckpt,
        )
        # Per-instance voxel counts; used by collate_fn's load balancing.
        self.loads = [self.metadata.loc[sha256, 'num_voxels'] for _, sha256 in self.instances]
        if self.normalization is not None:
            # Shape (1, C) so the stats broadcast over (N, C) feature matrices.
            self.mean = torch.tensor(self.normalization['mean']).reshape(1, -1)
            self.std = torch.tensor(self.normalization['std']).reshape(1, -1)

    def filter_metadata(self, metadata):
        """Keep rows that have latents, pass the aesthetic-score threshold,
        and fit the voxel budget; return (filtered_metadata, counts per stage)."""
        stats = {}
        metadata = metadata[metadata[f'latent_{self.latent_model}']]
        stats['With latent'] = len(metadata)
        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
        metadata = metadata[metadata['num_voxels'] <= self.max_num_voxels]
        stats[f'Num voxels <= {self.max_num_voxels}'] = len(metadata)
        return metadata, stats

    def get_instance(self, root, instance):
        """Load one instance's latent: int voxel coords and (optionally
        normalized) float features, as a {'coords', 'feats'} dict."""
        data = np.load(os.path.join(root, 'latents', self.latent_model, f'{instance}.npz'))
        coords = torch.tensor(data['coords']).int()
        feats = torch.tensor(data['feats']).float()
        if self.normalization is not None:
            feats = (feats - self.mean) / self.std
        return {
            'coords': coords,
            'feats': feats,
        }

    @staticmethod
    def collate_fn(batch, split_size=None):
        """Collate instance dicts into SparseTensor packs.

        With ``split_size`` set, instances are partitioned into groups of that
        size balanced by voxel count, and a list of packs (one per group) is
        returned; otherwise a single pack is returned.
        """
        if split_size is None:
            group_idx = [list(range(len(batch)))]
        else:
            group_idx = load_balanced_group_indices([b['coords'].shape[0] for b in batch], split_size)
        packs = []
        for group in group_idx:
            sub_batch = [batch[i] for i in group]
            pack = {}
            coords = []
            feats = []
            layout = []
            start = 0
            for i, b in enumerate(sub_batch):
                # Prefix each voxel's coords with its index within the group
                # so all instances can share one flat coordinate tensor.
                coords.append(torch.cat([torch.full((b['coords'].shape[0], 1), i, dtype=torch.int32), b['coords']], dim=-1))
                feats.append(b['feats'])
                # Remember each instance's row range for later unpacking.
                layout.append(slice(start, start + b['coords'].shape[0]))
                start += b['coords'].shape[0]
            coords = torch.cat(coords)
            feats = torch.cat(feats)
            pack['x_0'] = SparseTensor(
                coords=coords,
                feats=feats,
            )
            # Record the logical batch shape and per-instance layout on the
            # sparse tensor (uses SparseTensor internals).
            pack['x_0']._shape = torch.Size([len(group), *sub_batch[0]['feats'].shape[1:]])
            pack['x_0'].register_spatial_cache('layout', layout)
            # collate other data: stack tensors, concatenate lists, otherwise
            # collect values as-is.
            keys = [k for k in sub_batch[0].keys() if k not in ['coords', 'feats']]
            for k in keys:
                if isinstance(sub_batch[0][k], torch.Tensor):
                    pack[k] = torch.stack([b[k] for b in sub_batch])
                elif isinstance(sub_batch[0][k], list):
                    pack[k] = sum([b[k] for b in sub_batch], [])
                else:
                    pack[k] = [b[k] for b in sub_batch]
            packs.append(pack)
        if split_size is None:
            return packs[0]
        return packs
class TextConditionedSLat(TextConditionedMixin, SLat):
    """
    Text conditioned structured latent dataset.

    Combines ``SLat`` loading with the text conditioning behavior supplied by
    ``TextConditionedMixin``; everything is inherited, nothing is overridden.
    """
    pass
class ImageConditionedSLat(ImageConditionedMixin, SLat):
    """
    Image conditioned structured latent dataset.

    Combines ``SLat`` loading with the image conditioning behavior supplied by
    ``ImageConditionedMixin``; everything is inherited, nothing is overridden.
    """
    pass
|