Spaces: Running on Zero

Commit: init test without models

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +2 -0
- .gitignore +139 -0
- app.py +202 -0
- assets/test1.png +3 -0
- assets/test2.png +3 -0
- assets/test3.jpg +0 -0
- assets/test4.jpeg +0 -0
- assets/test5.jpeg +0 -0
- hort/models/__init__.py +114 -0
- hort/models/network/pointnet.py +36 -0
- hort/models/tgs/__init__.py +9 -0
- hort/models/tgs/data.py +265 -0
- hort/models/tgs/models/__init__.py +0 -0
- hort/models/tgs/models/image_feature.py +48 -0
- hort/models/tgs/models/networks.py +204 -0
- hort/models/tgs/models/pointclouds/LICENSE_POINTNET +21 -0
- hort/models/tgs/models/pointclouds/pointnet.py +121 -0
- hort/models/tgs/models/pointclouds/simplepoint.py +110 -0
- hort/models/tgs/models/renderer.py +427 -0
- hort/models/tgs/models/snowflake/LICENSE +21 -0
- hort/models/tgs/models/snowflake/SPD.py +68 -0
- hort/models/tgs/models/snowflake/SPD_crossattn.py +81 -0
- hort/models/tgs/models/snowflake/SPD_pp.py +71 -0
- hort/models/tgs/models/snowflake/attention.py +239 -0
- hort/models/tgs/models/snowflake/model_spdpp.py +239 -0
- hort/models/tgs/models/snowflake/pointnet2.py +126 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/__init__.py +3 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/ball_query.h +5 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/cuda_utils.h +41 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/group_points.h +5 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/interpolate.h +10 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/sampling.h +6 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/utils.h +25 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/ball_query.cpp +32 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/ball_query_gpu.cu +54 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/bindings.cpp +19 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/group_points.cpp +62 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/group_points_gpu.cu +75 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/interpolate.cpp +99 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/interpolate_gpu.cu +154 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/sampling.cpp +87 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/sampling_gpu.cu +229 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_version.py +1 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/pointnet2_modules.py +209 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/pointnet2_utils.py +391 -0
- hort/models/tgs/models/snowflake/pointnet2_ops_lib/setup.py +41 -0
- hort/models/tgs/models/snowflake/skip_transformer.py +69 -0
- hort/models/tgs/models/snowflake/utils.py +741 -0
- hort/models/tgs/models/tokenizers/dinov2.py +1179 -0
- hort/models/tgs/models/tokenizers/image.py +123 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/test1.png filter=lfs diff=lfs merge=lfs -text
+assets/test2.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,139 @@
# Byte-compiled / optimized / DLL files
__pycache__
*.py[cod]
*$py.class

# pyc
*.pyc

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# VSCode
.vscode

*.swp
*.h5
*.mp4
app.py
ADDED
@@ -0,0 +1,202 @@
import os
import sys
os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"

import gradio as gr
import spaces
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from pathlib import Path
import argparse
import json
from torchvision import transforms
from typing import Dict, Optional
from PIL import Image, ImageDraw
from lang_sam import LangSAM

from wilor.models import load_wilor
from wilor.utils import recursive_to
from wilor.datasets.vitdet_dataset import ViTDetDataset
from hort.models import load_hort
from hort.utils.renderer import Renderer, cam_crop_to_new
from hort.utils.img_utils import process_bbox, generate_patch_image, PerspectiveCamera

LIGHT_PURPLE = (0.25098039, 0.274117647, 0.65882353)
STEEL_BLUE = (0.2745098, 0.5098039, 0.7058824)

# Download and load checkpoints
wilor_model, wilor_model_cfg = load_wilor(checkpoint_path='./pretrained_models/wilor_final.ckpt', cfg_path='./pretrained_models/model_config.yaml')
hand_detector = YOLO('./pretrained_models/detector.pt')
# Setup the renderer
renderer = Renderer(wilor_model_cfg, faces=wilor_model.mano.faces)
# Setup the SAM model
sam_model = LangSAM(sam_type="sam2.1_hiera_large")
# Setup the HORT model
hort_model = load_hort("./pretrained_models/hort_final.pth.tar")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
wilor_model = wilor_model.to(device)
hand_detector = hand_detector.to(device)
hort_model = hort_model.to(device)
wilor_model.eval()
hort_model.eval()

image_transform = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

@spaces.GPU()
def run_model(image, conf, IoU_threshold=0.5):
    img_cv2 = image[..., ::-1]
    img_pil = Image.fromarray(image)

    pred_obj = sam_model.predict([img_pil], ["manipulated object"])
    pred_hand = sam_model.predict([img_pil], ["hand"])

    bbox_obj = pred_obj[0]["boxes"][0].reshape((-1, 2))
    mask_obj = pred_obj[0]["masks"][0]
    bbox_hand = pred_hand[0]["boxes"][0].reshape((-1, 2))
    mask_hand = pred_hand[0]["masks"][0]

    tl = np.min(np.concatenate([bbox_obj, bbox_hand], axis=0), axis=0)
    br = np.max(np.concatenate([bbox_obj, bbox_hand], axis=0), axis=0)
    box_size = br - tl
    bbox = np.concatenate([tl - 10, box_size + 20], axis=0)
    ho_bbox = process_bbox(bbox)

    detections = hand_detector(img_cv2, conf=conf, verbose=False, iou=IoU_threshold)[0]

    bboxes = []
    is_right = []
    for det in detections:
        Bbox = det.boxes.data.cpu().detach().squeeze().numpy()
        is_right.append(det.boxes.cls.cpu().detach().squeeze().item())
        bboxes.append(Bbox[:4].tolist())

    if len(bboxes) == 1:
        boxes = np.stack(bboxes)
        right = np.stack(is_right)
        if not right:
            new_x1 = img_cv2.shape[1] - boxes[0][2]
            new_x2 = img_cv2.shape[1] - boxes[0][0]
            boxes[0][0] = new_x1
            boxes[0][2] = new_x2
            ho_bbox[0] = img_cv2.shape[1] - (ho_bbox[0] + ho_bbox[2])
            img_cv2 = cv2.flip(img_cv2, 1)
            right[0] = 1.
        crop_img_cv2, _ = generate_patch_image(img_cv2, ho_bbox, (224, 224), 0, 1.0, 0)

        dataset = ViTDetDataset(wilor_model_cfg, img_cv2, boxes, right, rescale_factor=2.0)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

        for batch in dataloader:
            batch = recursive_to(batch, device)

            with torch.no_grad():
                out = wilor_model(batch)

            pred_cam = out['pred_cam']
            box_center = batch["box_center"].float()
            box_size = batch["box_size"].float()
            img_size = batch["img_size"].float()
            scaled_focal_length = wilor_model_cfg.EXTRA.FOCAL_LENGTH / wilor_model_cfg.MODEL.IMAGE_SIZE * 224
            pred_cam_t_full = cam_crop_to_new(pred_cam, box_center, box_size, img_size, torch.from_numpy(np.array(ho_bbox, dtype=np.float32))[None, :].to(img_size.device), scaled_focal_length).detach().cpu().numpy()

            batch_size = batch['img'].shape[0]
            for n in range(batch_size):
                verts = out['pred_vertices'][n].detach().cpu().numpy()
                joints = out['pred_keypoints_3d'][n].detach().cpu().numpy()

                is_right = batch['right'][n].cpu().numpy()
                palm = (verts[95] + verts[22]) / 2
                cam_t = pred_cam_t_full[n]

                img_input = image_transform(crop_img_cv2[:, :, ::-1]).unsqueeze(0).cuda()
                camera = PerspectiveCamera(5000 / 256 * 224, 5000 / 256 * 224, 112, 112)
                cam_intr = camera.intrinsics

                metas = dict()
                metas["right_hand_verts_3d"] = torch.from_numpy((verts + cam_t)[None]).cuda()
                metas["right_hand_joints_3d"] = torch.from_numpy((joints + cam_t)[None]).cuda()
                metas["right_hand_palm"] = torch.from_numpy((palm + cam_t)[None]).cuda()
                metas["cam_intr"] = torch.from_numpy(cam_intr[None]).cuda()
                with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                    pc_results = hort_model(img_input, metas)
                objtrans = pc_results["objtrans"][0].detach().cpu().numpy()
                pointclouds_up = pc_results["pointclouds_up"][0].detach().cpu().numpy() * 0.3

        reconstructions = {'verts': verts, 'palm': palm, 'objtrans': objtrans, 'objpcs': pointclouds_up, 'cam_t': cam_t, 'right': is_right, 'img_size': 224, 'focal': scaled_focal_length}

        return crop_img_cv2[..., ::-1].astype(np.float32) / 255.0, len(detections), reconstructions
    else:
        return crop_img_cv2[..., ::-1].astype(np.float32) / 255.0, len(detections), None


def render_reconstruction(image, conf, IoU_threshold=0.3):
    input_img, num_dets, reconstructions = run_model(image, conf, IoU_threshold=0.5)
    if num_dets == 1:
        # Render front view
        misc_args = dict(mesh_base_color=LIGHT_PURPLE, point_base_color=STEEL_BLUE, scene_bg_color=(1, 1, 1), focal_length=reconstructions['focal'])
        cam_view = renderer.render_rgba(reconstructions['verts'], reconstructions['objpcs'] + reconstructions['palm'] + reconstructions['objtrans'], cam_t=reconstructions['cam_t'], render_res=(224, 224), is_right=True, **misc_args)

        # Overlay image
        input_img = np.concatenate([input_img, np.ones_like(input_img[:, :, :1])], axis=2)  # Add alpha channel
        input_img_overlay = input_img[:, :, :3] * (1 - cam_view[:, :, 3:]) + cam_view[:, :, :3] * cam_view[:, :, 3:]

        return input_img_overlay, f'{num_dets} hands detected'
    else:
        return input_img, f'{num_dets} hands detected'


header = ('''
<div class="embed_hidden" style="text-align: center;">
<h1> <b>HORT</b>: Monocular Hand-held Objects Reconstruction with Transformers</h1>
<h3>
<a href="https://zerchen.github.io/" target="_blank" rel="noopener noreferrer">Zerui Chen</a><sup>1</sup>,
<a href="https://rolpotamias.github.io" target="_blank" rel="noopener noreferrer">Rolandos Alexandros Potamias</a><sup>2</sup>,
<br>
<a href="https://cshizhe.github.io/" target="_blank" rel="noopener noreferrer">Shizhe Chen</a><sup>1</sup>,
<a href="https://cordeliaschmid.github.io/" target="_blank" rel="noopener noreferrer">Cordelia Schmid</a><sup>1</sup>
</h3>
<h3>
<sup>1</sup>Inria, Ecole normale supérieure, CNRS, PSL Research University;
<sup>2</sup>Imperial College London
</h3>
</div>
<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
<a href='https://arxiv.org/abs/2503.21313'><img src='https://img.shields.io/badge/Arxiv-2503.21313-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
<a href='https://arxiv.org/pdf/2503.21313'><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
<a href='https://zerchen.github.io/projects/hort.html'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
<a href='https://github.com/zerchen/hort'><img src='https://img.shields.io/badge/GitHub-Code-black?style=flat&logo=github&logoColor=white'></a>
''')


with gr.Blocks(title="HORT: Monocular Hand-held Objects Reconstruction with Transformers", css=".gradio-container") as demo:

    gr.Markdown(header)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input image", type="numpy")
            threshold = gr.Slider(value=0.3, minimum=0.05, maximum=0.95, step=0.05, label='Detection Confidence Threshold')
            submit = gr.Button("Submit", variant="primary")

        with gr.Column():
            reconstruction = gr.Image(label="Reconstructions", type="numpy")
            hands_detected = gr.Textbox(label="Hands Detected")

    submit.click(fn=render_reconstruction, inputs=[input_image, threshold], outputs=[reconstruction, hands_detected])

    with gr.Row():
        example_images = gr.Examples([
            ['/home/user/app/assets/test1.png'],
            ['./demo_img/app/assets/test2.png'],
            ['./demo_img/app/assets/test3.jpg'],
            ['./demo_img/app/assets/test4.jpeg'],
            ['./demo_img/app/assets/test5.jpeg']
        ],
        inputs=input_image)

demo.launch(debug=True)
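The script above wires everything into a Gradio Blocks app that is started with demo.launch(debug=True). As a rough local smoke test (a sketch only: it assumes the checkpoints under ./pretrained_models and the LFS-tracked assets are present, a CUDA GPU is available, and that importing app is acceptable even though it loads every model at import time), the pipeline can also be driven directly:

# Hypothetical smoke test; not part of the commit.
import cv2
from app import render_reconstruction

image = cv2.cvtColor(cv2.imread("assets/test1.png"), cv2.COLOR_BGR2RGB)   # RGB numpy image, as gr.Image provides
overlay, message = render_reconstruction(image, conf=0.3)
print(message)   # e.g. "1 hands detected"
cv2.imwrite("overlay.png", cv2.cvtColor((overlay * 255).clip(0, 255).astype("uint8"), cv2.COLOR_RGB2BGR))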
assets/test1.png
ADDED (binary image, tracked with Git LFS)

assets/test2.png
ADDED (binary image, tracked with Git LFS)

assets/test3.jpg
ADDED (binary image)

assets/test4.jpeg
ADDED (binary image)

assets/test5.jpeg
ADDED (binary image)
hort/models/__init__.py
ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn as nn
import sys
import os.path as osp
import numpy as np
from yacs.config import CfgNode as CN
this_dir = osp.dirname(__file__)
sys.path.insert(0, this_dir)
import tgs
from network.pointnet import PointNetEncoder

hort_cfg = CN()
hort_cfg.image_tokenizer_cls = "tgs.models.tokenizers.image.DINOV2SingleImageTokenizer"
hort_cfg.image_tokenizer = CN()
hort_cfg.image_tokenizer.pretrained_model_name_or_path = "facebook/dinov2-large"
hort_cfg.image_tokenizer.width = 224
hort_cfg.image_tokenizer.height = 224
hort_cfg.image_tokenizer.modulation = False
hort_cfg.image_tokenizer.modulation_zero_init = True
hort_cfg.image_tokenizer.modulation_cond_dim = 1024
hort_cfg.image_tokenizer.freeze_backbone_params = False
hort_cfg.image_tokenizer.enable_memory_efficient_attention = False
hort_cfg.image_tokenizer.enable_gradient_checkpointing = False

hort_cfg.tokenizer_cls = "tgs.models.tokenizers.point.PointLearnablePositionalEmbedding"
hort_cfg.tokenizer = CN()
hort_cfg.tokenizer.num_pcl = 2049
hort_cfg.tokenizer.num_channels = 512

hort_cfg.backbone_cls = "tgs.models.transformers.Transformer1D"
hort_cfg.backbone = CN()
hort_cfg.backbone.in_channels = 512
hort_cfg.backbone.num_attention_heads = 8
hort_cfg.backbone.attention_head_dim = 64
hort_cfg.backbone.num_layers = 10
hort_cfg.backbone.cross_attention_dim = 1024
hort_cfg.backbone.norm_type = "layer_norm"
hort_cfg.backbone.enable_memory_efficient_attention = False
hort_cfg.backbone.gradient_checkpointing = False

hort_cfg.post_processor_cls = "tgs.models.networks.PointOutLayer"
hort_cfg.post_processor = CN()
hort_cfg.post_processor.in_channels = 512
hort_cfg.post_processor.out_channels = 3

hort_cfg.pointcloud_upsampler_cls = "tgs.models.snowflake.model_spdpp.SnowflakeModelSPDPP"
hort_cfg.pointcloud_upsampler = CN()
hort_cfg.pointcloud_upsampler.input_channels = 1024
hort_cfg.pointcloud_upsampler.dim_feat = 128
hort_cfg.pointcloud_upsampler.num_p0 = 2048
hort_cfg.pointcloud_upsampler.radius = 1
hort_cfg.pointcloud_upsampler.bounding = True
hort_cfg.pointcloud_upsampler.use_fps = True
hort_cfg.pointcloud_upsampler.up_factors = [2, 4]
hort_cfg.pointcloud_upsampler.token_type = "image_token"


class model(nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.image_tokenizer = tgs.find(hort_cfg.image_tokenizer_cls)(hort_cfg.image_tokenizer)
        self.pointnet = PointNetEncoder(67, 1024)
        self.tokenizer = tgs.find(hort_cfg.tokenizer_cls)(hort_cfg.tokenizer)
        self.backbone = tgs.find(hort_cfg.backbone_cls)(hort_cfg.backbone)
        self.post_processor = tgs.find(hort_cfg.post_processor_cls)(hort_cfg.post_processor)
        self.post_processor_trans = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 3))
        self.pointcloud_upsampler = tgs.find(hort_cfg.pointcloud_upsampler_cls)(hort_cfg.pointcloud_upsampler)

    def forward(self, input_img, metas):
        with torch.no_grad():
            batch_size = input_img.shape[0]

            encoder_hidden_states = self.image_tokenizer(input_img, None)  # B * C * Nt
            encoder_hidden_states = encoder_hidden_states.transpose(2, 1)  # B * Nt * C

            palm_norm_hand_verts_3d = metas['right_hand_verts_3d'] - metas['right_hand_palm'].unsqueeze(1)
            point_idx = torch.arange(778).view(1, 778, 1).expand(batch_size, -1, -1).to(input_img.device) / 778.
            palm_norm_hand_verts_3d = torch.cat([palm_norm_hand_verts_3d, point_idx], -1)
            tip_norm_hand_verts_3d = (metas['right_hand_verts_3d'].unsqueeze(2) - metas['right_hand_joints_3d'].unsqueeze(1)).reshape((batch_size, 778, -1))
            norm_hand_verts_3d = torch.cat([palm_norm_hand_verts_3d, tip_norm_hand_verts_3d], -1)
            hand_feats = self.pointnet(norm_hand_verts_3d)

            tokens = self.tokenizer(batch_size)
            tokens = self.backbone(tokens, torch.cat([encoder_hidden_states, hand_feats.unsqueeze(1)], 1), modulation_cond=None)
            tokens = self.tokenizer.detokenize(tokens)

            pointclouds = self.post_processor(tokens[:, :2048, :])
            pred_obj_trans = self.post_processor_trans(tokens[:, -1, :])

            upsampling_input = {
                "input_image_tokens": encoder_hidden_states.permute(0, 2, 1),
                "intrinsic_cond": metas['cam_intr'],
                "points": pointclouds,
                "hand_points": metas["right_hand_verts_3d"],
                "trans": pred_obj_trans + metas['right_hand_palm'],
                "scale": 0.3
            }
            up_results = self.pointcloud_upsampler(upsampling_input)
            pointclouds_up = up_results[-1]

            pc_results = {}
            pc_results['pointclouds'] = pointclouds
            pc_results['objtrans'] = pred_obj_trans
            pc_results['handpalm'] = metas['right_hand_palm']
            pc_results['pointclouds_up'] = pointclouds_up

            return pc_results

def load_hort(ckpt_path):
    hort_model = model()
    ckpt = torch.load(ckpt_path, map_location=torch.device('cpu'))["network"]
    ckpt = {k.replace('module.', ''): v for k, v in ckpt.items()}
    hort_model.load_state_dict(ckpt)
    return hort_model
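A minimal sketch of how load_hort and model.forward are meant to be called, mirroring the call site in app.py from this same commit. The dummy tensors only illustrate the expected shapes (778 MANO vertices, 21 joints, a 3x3 intrinsic matrix, a normalized 224x224 crop); it assumes the checkpoint file and a CUDA device are available:

# Illustrative only; requires ./pretrained_models/hort_final.pth.tar and CUDA.
import torch
from hort.models import load_hort

hort_model = load_hort("./pretrained_models/hort_final.pth.tar").cuda().eval()

img = torch.randn(1, 3, 224, 224).cuda()                    # normalized RGB crop
metas = {
    "right_hand_verts_3d": torch.randn(1, 778, 3).cuda(),   # hand vertices in camera space
    "right_hand_joints_3d": torch.randn(1, 21, 3).cuda(),   # hand joints in camera space
    "right_hand_palm": torch.randn(1, 3).cuda(),            # palm center in camera space
    "cam_intr": torch.eye(3).unsqueeze(0).cuda(),           # pinhole intrinsics
}
with torch.no_grad():
    pc_results = hort_model(img, metas)
print(pc_results["pointclouds"].shape, pc_results["pointclouds_up"].shape)  # coarse and upsampled object points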
hort/models/network/pointnet.py
ADDED
@@ -0,0 +1,36 @@
import torch
import torch.nn as nn


class PointNetEncoder(nn.Module):
    """Encoder for Pointcloud
    """
    def __init__(self, in_channels: int=3, output_channels: int=768):
        super().__init__()

        block_channel = [64, 128, 256, 512]
        self.mlp = nn.Sequential(
            nn.Linear(in_channels, block_channel[0]),
            nn.LayerNorm(block_channel[0]),
            nn.ReLU(),
            nn.Linear(block_channel[0], block_channel[1]),
            nn.LayerNorm(block_channel[1]),
            nn.ReLU(),
            nn.Linear(block_channel[1], block_channel[2]),
            nn.LayerNorm(block_channel[2]),
            nn.ReLU(),
            nn.Linear(block_channel[2], block_channel[3]),
            nn.LayerNorm(block_channel[3]),
            nn.ReLU(),
        )

        self.final_projection = nn.Sequential(
            nn.Linear(block_channel[-1], output_channels),
            nn.LayerNorm(output_channels)
        )

    def forward(self, x):
        x = self.mlp(x)
        x = torch.max(x, 1)[0]
        x = self.final_projection(x)
        return x
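The encoder is a shared per-point MLP followed by a max-pool over the point dimension. A small sketch follows; the 67-channel input matches how hort/models/__init__.py builds its hand features (3 palm-relative coordinates, 1 normalized vertex index, and 21 x 3 joint-relative offsets), and the import path is spelled out only for illustration:

import torch
from hort.models.network.pointnet import PointNetEncoder  # illustrative import path

encoder = PointNetEncoder(in_channels=67, output_channels=1024)
points = torch.randn(2, 778, 67)   # two hands, 778 vertices, 67 features per vertex
feats = encoder(points)            # per-point MLP, then max over the 778 vertices
print(feats.shape)                 # torch.Size([2, 1024])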
hort/models/tgs/__init__.py
ADDED
@@ -0,0 +1,9 @@
import importlib
from tgs.utils.typing import *

def find(cls_string) -> Type:
    module_string = ".".join(cls_string.split(".")[:-1])
    cls_name = cls_string.split(".")[-1]
    module = importlib.import_module(module_string, package=None)
    cls = getattr(module, cls_name)
    return cls
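find is a small dynamic-import helper: every *_cls string in the configs elsewhere in this commit is resolved through it. A minimal sketch of what it does:

import tgs
from tgs.models.networks import PointOutLayer

cls = tgs.find("tgs.models.networks.PointOutLayer")
assert cls is PointOutLayer   # find() is just importlib.import_module followed by getattr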
hort/models/tgs/data.py
ADDED
@@ -0,0 +1,265 @@
import json
import math
from dataclasses import dataclass, field

import os
import imageio
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import Dataset

from tgs.utils.config import parse_structured
from tgs.utils.ops import get_intrinsic_from_fov, get_ray_directions, get_rays
from tgs.utils.typing import *


def _parse_scene_list_single(scene_list_path: str):
    if scene_list_path.endswith(".json"):
        with open(scene_list_path) as f:
            all_scenes = json.loads(f.read())
    elif scene_list_path.endswith(".txt"):
        with open(scene_list_path) as f:
            all_scenes = [p.strip() for p in f.readlines()]
    else:
        all_scenes = [scene_list_path]

    return all_scenes


def _parse_scene_list(scene_list_path: Union[str, List[str]]):
    all_scenes = []
    if isinstance(scene_list_path, str):
        scene_list_path = [scene_list_path]
    for scene_list_path_ in scene_list_path:
        all_scenes += _parse_scene_list_single(scene_list_path_)
    return all_scenes

@dataclass
class CustomImageDataModuleConfig:
    image_list: Any = ""
    background_color: Tuple[float, float, float] = field(
        default_factory=lambda: (1.0, 1.0, 1.0)
    )

    relative_pose: bool = False
    cond_height: int = 512
    cond_width: int = 512
    cond_camera_distance: float = 1.6
    cond_fovy_deg: float = 40.0
    cond_elevation_deg: float = 0.0
    cond_azimuth_deg: float = 0.0
    num_workers: int = 16

    eval_height: int = 512
    eval_width: int = 512
    eval_batch_size: int = 1
    eval_elevation_deg: float = 0.0
    eval_camera_distance: float = 1.6
    eval_fovy_deg: float = 40.0
    n_test_views: int = 120
    num_views_output: int = 120
    only_3dgs: bool = False

class CustomImageOrbitDataset(Dataset):
    def __init__(self, cfg: Any) -> None:
        super().__init__()
        self.cfg: CustomImageDataModuleConfig = parse_structured(CustomImageDataModuleConfig, cfg)

        self.n_views = self.cfg.n_test_views
        assert self.n_views % self.cfg.num_views_output == 0

        self.all_scenes = _parse_scene_list(self.cfg.image_list)

        azimuth_deg: Float[Tensor, "B"] = torch.linspace(0, 360.0, self.n_views + 1)[
            : self.n_views
        ]
        elevation_deg: Float[Tensor, "B"] = torch.full_like(
            azimuth_deg, self.cfg.eval_elevation_deg
        )
        camera_distances: Float[Tensor, "B"] = torch.full_like(
            elevation_deg, self.cfg.eval_camera_distance
        )

        elevation = elevation_deg * math.pi / 180
        azimuth = azimuth_deg * math.pi / 180

        # convert spherical coordinates to cartesian coordinates
        # right hand coordinate system, x back, y right, z up
        # elevation in (-90, 90), azimuth from +x to +y in (-180, 180)
        camera_positions: Float[Tensor, "B 3"] = torch.stack(
            [
                camera_distances * torch.cos(elevation) * torch.cos(azimuth),
                camera_distances * torch.cos(elevation) * torch.sin(azimuth),
                camera_distances * torch.sin(elevation),
            ],
            dim=-1,
        )

        # default scene center at origin
        center: Float[Tensor, "B 3"] = torch.zeros_like(camera_positions)
        # default camera up direction as +z
        up: Float[Tensor, "B 3"] = torch.as_tensor([0, 0, 1], dtype=torch.float32)[
            None, :
        ].repeat(self.n_views, 1)

        fovy_deg: Float[Tensor, "B"] = torch.full_like(
            elevation_deg, self.cfg.eval_fovy_deg
        )
        fovy = fovy_deg * math.pi / 180

        lookat: Float[Tensor, "B 3"] = F.normalize(center - camera_positions, dim=-1)
        right: Float[Tensor, "B 3"] = F.normalize(torch.cross(lookat, up), dim=-1)
        up = F.normalize(torch.cross(right, lookat), dim=-1)
        c2w3x4: Float[Tensor, "B 3 4"] = torch.cat(
            [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]],
            dim=-1,
        )
        c2w: Float[Tensor, "B 4 4"] = torch.cat(
            [c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1
        )
        c2w[:, 3, 3] = 1.0

        # get directions by dividing directions_unit_focal by focal length
        focal_length: Float[Tensor, "B"] = (
            0.5 * self.cfg.eval_height / torch.tan(0.5 * fovy)
        )
        directions_unit_focal = get_ray_directions(
            H=self.cfg.eval_height,
            W=self.cfg.eval_width,
            focal=1.0,
        )
        directions: Float[Tensor, "B H W 3"] = directions_unit_focal[
            None, :, :, :
        ].repeat(self.n_views, 1, 1, 1)
        directions[:, :, :, :2] = (
            directions[:, :, :, :2] / focal_length[:, None, None, None]
        )
        # must use normalize=True to normalize directions here
        rays_o, rays_d = get_rays(directions, c2w, keepdim=True)

        intrinsic: Float[Tensor, "B 3 3"] = get_intrinsic_from_fov(
            self.cfg.eval_fovy_deg * math.pi / 180,
            H=self.cfg.eval_height,
            W=self.cfg.eval_width,
            bs=self.n_views,
        )
        intrinsic_normed: Float[Tensor, "B 3 3"] = intrinsic.clone()
        intrinsic_normed[..., 0, 2] /= self.cfg.eval_width
        intrinsic_normed[..., 1, 2] /= self.cfg.eval_height
        intrinsic_normed[..., 0, 0] /= self.cfg.eval_width
        intrinsic_normed[..., 1, 1] /= self.cfg.eval_height

        self.rays_o, self.rays_d = rays_o, rays_d
        self.intrinsic = intrinsic
        self.intrinsic_normed = intrinsic_normed
        self.c2w = c2w
        self.camera_positions = camera_positions

        self.background_color = torch.as_tensor(self.cfg.background_color)

        # condition
        self.intrinsic_cond = get_intrinsic_from_fov(
            np.deg2rad(self.cfg.cond_fovy_deg),
            H=self.cfg.cond_height,
            W=self.cfg.cond_width,
        )
        self.intrinsic_normed_cond = self.intrinsic_cond.clone()
        self.intrinsic_normed_cond[..., 0, 2] /= self.cfg.cond_width
        self.intrinsic_normed_cond[..., 1, 2] /= self.cfg.cond_height
        self.intrinsic_normed_cond[..., 0, 0] /= self.cfg.cond_width
        self.intrinsic_normed_cond[..., 1, 1] /= self.cfg.cond_height


        if self.cfg.relative_pose:
            self.c2w_cond = torch.as_tensor(
                [
                    [0, 0, 1, self.cfg.cond_camera_distance],
                    [1, 0, 0, 0],
                    [0, 1, 0, 0],
                    [0, 0, 0, 1],
                ]
            ).float()
        else:
            cond_elevation = self.cfg.cond_elevation_deg * math.pi / 180
            cond_azimuth = self.cfg.cond_azimuth_deg * math.pi / 180
            cond_camera_position: Float[Tensor, "3"] = torch.as_tensor(
                [
                    self.cfg.cond_camera_distance * np.cos(cond_elevation) * np.cos(cond_azimuth),
                    self.cfg.cond_camera_distance * np.cos(cond_elevation) * np.sin(cond_azimuth),
                    self.cfg.cond_camera_distance * np.sin(cond_elevation),
                ], dtype=torch.float32
            )

            cond_center: Float[Tensor, "3"] = torch.zeros_like(cond_camera_position)
            cond_up: Float[Tensor, "3"] = torch.as_tensor([0, 0, 1], dtype=torch.float32)
            cond_lookat: Float[Tensor, "3"] = F.normalize(cond_center - cond_camera_position, dim=-1)
            cond_right: Float[Tensor, "3"] = F.normalize(torch.cross(cond_lookat, cond_up), dim=-1)
            cond_up = F.normalize(torch.cross(cond_right, cond_lookat), dim=-1)
            cond_c2w3x4: Float[Tensor, "3 4"] = torch.cat(
                [torch.stack([cond_right, cond_up, -cond_lookat], dim=-1), cond_camera_position[:, None]],
                dim=-1,
            )
            cond_c2w: Float[Tensor, "4 4"] = torch.cat(
                [cond_c2w3x4, torch.zeros_like(cond_c2w3x4[:1])], dim=0
            )
            cond_c2w[3, 3] = 1.0
            self.c2w_cond = cond_c2w

    def __len__(self):
        if self.cfg.only_3dgs:
            return len(self.all_scenes)
        else:
            return len(self.all_scenes) * self.n_views // self.cfg.num_views_output

    def __getitem__(self, index):
        if self.cfg.only_3dgs:
            scene_index = index
            view_index = [0]
        else:
            scene_index = index * self.cfg.num_views_output // self.n_views
            view_start = index % (self.n_views // self.cfg.num_views_output)
            view_index = list(range(self.n_views))[view_start * self.cfg.num_views_output :
                                                   (view_start + 1) * self.cfg.num_views_output]

        img_path = self.all_scenes[scene_index]
        img_cond = torch.from_numpy(
            np.asarray(
                Image.fromarray(imageio.v2.imread(img_path))
                .convert("RGBA")
                .resize((self.cfg.cond_width, self.cfg.cond_height))
            )
            / 255.0
        ).float()
        mask_cond: Float[Tensor, "Hc Wc 1"] = img_cond[:, :, -1:]
        rgb_cond: Float[Tensor, "Hc Wc 3"] = img_cond[
            :, :, :3
        ] * mask_cond + self.background_color[None, None, :] * (1 - mask_cond)

        out = {
            "rgb_cond": rgb_cond.unsqueeze(0),
            "c2w_cond": self.c2w_cond.unsqueeze(0),
            "mask_cond": mask_cond.unsqueeze(0),
            "intrinsic_cond": self.intrinsic_cond.unsqueeze(0),
            "intrinsic_normed_cond": self.intrinsic_normed_cond.unsqueeze(0),
            "view_index": torch.as_tensor(view_index),
            "rays_o": self.rays_o[view_index],
            "rays_d": self.rays_d[view_index],
            "intrinsic": self.intrinsic[view_index],
            "intrinsic_normed": self.intrinsic_normed[view_index],
            "c2w": self.c2w[view_index],
            "camera_positions": self.camera_positions[view_index],
        }
        out["c2w"][..., :3, 1:3] *= -1
        out["c2w_cond"][..., :3, 1:3] *= -1
        instance_id = os.path.split(img_path)[-1].split('.')[0]
        out["index"] = torch.as_tensor(scene_index)
        out["background_color"] = self.background_color
        out["instance_id"] = instance_id
        return out

    def collate(self, batch):
        batch = torch.utils.data.default_collate(batch)
        batch.update({"height": self.cfg.eval_height, "width": self.cfg.eval_width})
        return batch
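The orbit cameras above are placed by the usual spherical-to-Cartesian conversion (right-handed frame, x back, y right, z up). A quick worked example with the dataset defaults (distance 1.6, elevation 0) for a single view a quarter of the way around the orbit:

import math
import torch

distance, elevation, azimuth = 1.6, 0.0, math.pi / 2
position = torch.tensor([
    distance * math.cos(elevation) * math.cos(azimuth),
    distance * math.cos(elevation) * math.sin(azimuth),
    distance * math.sin(elevation),
])
print(position)   # approximately (0.0, 1.6, 0.0): the camera sits on the +y axis, looking back at the origin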
hort/models/tgs/models/__init__.py
ADDED
File without changes
hort/models/tgs/models/image_feature.py
ADDED
@@ -0,0 +1,48 @@
from dataclasses import dataclass
import torch
import torch.nn.functional as F
from einops import rearrange

from tgs.utils.base import BaseModule
from tgs.utils.ops import compute_distance_transform
from tgs.utils.typing import *

class ImageFeature(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        use_rgb: bool = True
        use_feature: bool = True
        use_mask: bool = True
        feature_dim: int = 128
        out_dim: int = 133
        backbone: str = "default"
        freeze_backbone_params: bool = True

    cfg: Config

    def forward(self, rgb, mask=None, feature=None):
        B, Nv, H, W = rgb.shape[:4]
        rgb = rearrange(rgb, "B Nv H W C -> (B Nv) C H W")
        if mask is not None:
            mask = rearrange(mask, "B Nv H W C -> (B Nv) C H W")

        assert feature is not None
        # reshape dino tokens to image-like size
        feature = rearrange(feature, "B (Nv Nt) C -> (B Nv) Nt C", Nv=Nv)
        feature = feature[:, 1:].reshape(B * Nv, H // 14, W // 14, -1).permute(0, 3, 1, 2).contiguous()
        feature = F.interpolate(feature, size=(H, W), mode='bilinear', align_corners=False)

        if mask is not None and mask.is_floating_point():
            mask = mask > 0.5

        image_features = []
        if self.cfg.use_rgb:
            image_features.append(rgb)
        if self.cfg.use_feature:
            image_features.append(feature)
        if self.cfg.use_mask:
            image_features += [mask, compute_distance_transform(mask)]

        # detach features, occur error when with grad
        image_features = torch.cat(image_features, dim=1)  # .detach()
        return rearrange(image_features, "(B Nv) C H W -> B Nv C H W", B=B, Nv=Nv).squeeze(1)
hort/models/tgs/models/networks.py
ADDED
@@ -0,0 +1,204 @@
from dataclasses import dataclass

import torch
import torch.nn as nn
from einops import rearrange
import numpy as np

from tgs.utils.base import BaseModule
from tgs.utils.ops import get_activation
from tgs.utils.typing import *

class PointOutLayer(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        in_channels: int = 1024
        out_channels: int = 3
    cfg: Config
    def configure(self) -> None:
        super().configure()
        self.point_layer = nn.Linear(self.cfg.in_channels, self.cfg.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        nn.init.constant_(self.point_layer.weight, 0)
        nn.init.constant_(self.point_layer.bias, 0)

    def forward(self, x):
        return self.point_layer(x)

class TriplaneUpsampleNetwork(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        in_channels: int = 1024
        out_channels: int = 80

    cfg: Config

    def configure(self) -> None:
        super().configure()
        self.upsample = nn.ConvTranspose2d(
            self.cfg.in_channels, self.cfg.out_channels, kernel_size=2, stride=2
        )

    def forward(
        self, triplanes: Float[Tensor, "B 3 Ci Hp Wp"]
    ) -> Float[Tensor, "B 3 Co Hp2 Wp2"]:
        triplanes_up = rearrange(
            self.upsample(
                rearrange(triplanes, "B Np Ci Hp Wp -> (B Np) Ci Hp Wp", Np=3)
            ),
            "(B Np) Co Hp Wp -> B Np Co Hp Wp",
            Np=3,
        )
        return triplanes_up


class MLP(nn.Module):
    def __init__(
        self,
        dim_in: int,
        dim_out: int,
        n_neurons: int,
        n_hidden_layers: int,
        activation: str = "relu",
        output_activation: Optional[str] = None,
        bias: bool = True,
    ):
        super().__init__()
        layers = [
            self.make_linear(
                dim_in, n_neurons, is_first=True, is_last=False, bias=bias
            ),
            self.make_activation(activation),
        ]
        for i in range(n_hidden_layers - 1):
            layers += [
                self.make_linear(
                    n_neurons, n_neurons, is_first=False, is_last=False, bias=bias
                ),
                self.make_activation(activation),
            ]
        layers += [
            self.make_linear(
                n_neurons, dim_out, is_first=False, is_last=True, bias=bias
            )
        ]
        self.layers = nn.Sequential(*layers)
        self.output_activation = get_activation(output_activation)

    def forward(self, x):
        x = self.layers(x)
        x = self.output_activation(x)
        return x

    def make_linear(self, dim_in, dim_out, is_first, is_last, bias=True):
        layer = nn.Linear(dim_in, dim_out, bias=bias)
        return layer

    def make_activation(self, activation):
        if activation == "relu":
            return nn.ReLU(inplace=True)
        elif activation == "silu":
            return nn.SiLU(inplace=True)
        else:
            raise NotImplementedError

class GSProjection(nn.Module):
    def __init__(self,
                 in_channels: int = 80,
                 sh_degree: int = 3,
                 init_scaling: float = -5.0,
                 init_density: float = 0.1) -> None:
        super().__init__()

        self.out_keys = GS_KEYS + ["shs"]
        self.out_channels = GS_CHANNELS + [(sh_degree + 1) ** 2 * 3]

        self.out_layers = nn.ModuleList()
        for key, ch in zip(self.out_keys, self.out_channels):
            layer = nn.Linear(in_channels, ch)
            # initialize
            nn.init.constant_(layer.weight, 0)
            nn.init.constant_(layer.bias, 0)

            if key == "scaling":
                nn.init.constant_(layer.bias, init_scaling)
            elif key == "rotation":
                nn.init.constant_(layer.bias, 0)
                nn.init.constant_(layer.bias[0], 1.0)
            elif key == "opacity":
                inverse_sigmoid = lambda x: np.log(x / (1 - x))
                nn.init.constant_(layer.bias, inverse_sigmoid(init_density))

            self.out_layers.append(layer)

    def forward(self, x):
        ret = []
        for k, layer in zip(self.out_keys, self.out_layers):
            v = layer(x)
            if k == "rotation":
                v = torch.nn.functional.normalize(v)
            elif k == "scaling":
                v = torch.exp(v)
                # v = v.detach() # FIXME: for DEBUG
            elif k == "opacity":
                v = torch.sigmoid(v)
            # elif k == "shs":
            #     v = torch.reshape(v, (v.shape[0], -1, 3))
            ret.append(v)
        ret = torch.cat(ret, dim=-1)
        return ret

def get_encoding(n_input_dims: int, config) -> nn.Module:
    raise NotImplementedError


def get_mlp(n_input_dims, n_output_dims, config) -> nn.Module:
    raise NotImplementedError


# Resnet Blocks for pointnet
class ResnetBlockFC(nn.Module):
    ''' Fully connected ResNet Block class.

    Args:
        size_in (int): input dimension
        size_out (int): output dimension
        size_h (int): hidden dimension
    '''

    def __init__(self, size_in, size_out=None, size_h=None):
        super().__init__()
        # Attributes
        if size_out is None:
            size_out = size_in

        if size_h is None:
            size_h = min(size_in, size_out)

        self.size_in = size_in
        self.size_h = size_h
        self.size_out = size_out
        # Submodules
        self.fc_0 = nn.Linear(size_in, size_h)
        self.fc_1 = nn.Linear(size_h, size_out)
        self.actvn = nn.ReLU()

        if size_in == size_out:
            self.shortcut = None
        else:
            self.shortcut = nn.Linear(size_in, size_out, bias=False)
        # Initialization
        nn.init.zeros_(self.fc_1.weight)

    def forward(self, x):
        net = self.fc_0(self.actvn(x))
        dx = self.fc_1(self.actvn(net))

        if self.shortcut is not None:
            x_s = self.shortcut(x)
        else:
            x_s = x

        return x_s + dx
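A brief sketch of two building blocks that other files in this commit rely on. The dimensions are illustrative: simplepoint.py concatenates the flattened normalized intrinsics (9 values) with the flattened c2w matrix (16 values) before the camera embedder, and ResnetBlockFC is the per-point block inside LocalPoolPointnet; note that whether MLP accepts output_activation=None depends on tgs.utils.ops.get_activation, which is not part of this diff:

import torch
from tgs.models.networks import MLP, ResnetBlockFC

# Camera-embedder-style MLP; the 768-d output size is an illustrative choice.
camera_embedder = MLP(dim_in=25, dim_out=768, n_neurons=128, n_hidden_layers=2, activation="relu")
print(camera_embedder(torch.randn(4, 25)).shape)    # torch.Size([4, 768])

# Residual FC block: zero-initialized second layer, linear shortcut when sizes differ.
block = ResnetBlockFC(size_in=256, size_out=128)
print(block(torch.randn(4, 2048, 256)).shape)       # torch.Size([4, 2048, 128])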
hort/models/tgs/models/pointclouds/LICENSE_POINTNET
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Songyou Peng, Michael Niemeyer, Lars Mescheder, Marc Pollefeys, Andreas Geiger

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
hort/models/tgs/models/pointclouds/pointnet.py
ADDED
@@ -0,0 +1,121 @@
# modified from https://github.com/autonomousvision/convolutional_occupancy_networks/blob/master/src/encoder/pointnet.py
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch_scatter import scatter_mean, scatter_max

from tgs.utils.base import BaseModule
from tgs.models.networks import ResnetBlockFC
from tgs.utils.ops import scale_tensor

class LocalPoolPointnet(BaseModule):
    ''' PointNet-based encoder network with ResNet blocks for each point.
        The number of input points is fixed.

    Args:
        c_dim (int): dimension of latent code c
        dim (int): input points dimension
        hidden_dim (int): hidden dimension of the network
        scatter_type (str): feature aggregation when doing local pooling
        plane_resolution (int): defined resolution for plane feature
        padding (float): conventional padding parameter of ONet for unit cube, so [-0.5, 0.5] -> [-0.55, 0.55]
        n_blocks (int): number of ResnetBlockFC layers
    '''

    @dataclass
    class Config(BaseModule.Config):
        input_channels: int = 3
        c_dim: int = 128
        hidden_dim: int = 128
        scatter_type: str = "max"
        plane_size: int = 32
        n_blocks: int = 5
        radius: float = 1.

    cfg: Config

    def configure(self) -> None:
        super().configure()
        self.fc_pos = nn.Linear(self.cfg.input_channels, 2 * self.cfg.hidden_dim)
        self.blocks = nn.ModuleList([
            ResnetBlockFC(2 * self.cfg.hidden_dim, self.cfg.hidden_dim) for i in range(self.cfg.n_blocks)
        ])
        self.fc_c = nn.Linear(self.cfg.hidden_dim, self.cfg.c_dim)

        self.actvn = nn.ReLU()

        if self.cfg.scatter_type == 'max':
            self.scatter = scatter_max
        elif self.cfg.scatter_type == 'mean':
            self.scatter = scatter_mean
        else:
            raise ValueError('incorrect scatter type')


    def generate_plane_features(self, index, c):
        # acquire indices of features in plane
        # xy = normalize_coordinate(p.clone(), plane=plane, padding=self.padding) # normalize to the range of (0, 1)
        # index = self.coordinate2index(x, self.cfg.plane_size)

        # scatter plane features from points
        fea_plane = c.new_zeros(index.shape[0], self.cfg.c_dim, self.cfg.plane_size ** 2)
        c = c.permute(0, 2, 1)  # B x 512 x T
        fea_plane = scatter_mean(c, index, out=fea_plane)  # B x 512 x reso^2
        fea_plane = fea_plane.reshape(index.shape[0], self.cfg.c_dim, self.cfg.plane_size, self.cfg.plane_size)  # sparse matrix (B x 512 x reso x reso)

        return fea_plane

    def pool_local(self, xy, index, c):
        bs, fea_dim = c.shape[0], c.shape[2]
        keys = xy.keys()

        c_out = 0
        for key in keys:
            # scatter plane features from points
            fea = self.scatter(c.permute(0, 2, 1), index[key], dim_size=self.cfg.plane_size ** 2)
            if self.scatter == scatter_max:
                fea = fea[0]
            # gather feature back to points
            fea = fea.gather(dim=2, index=index[key].expand(-1, fea_dim, -1))
            c_out += fea
        return c_out.permute(0, 2, 1)

    def coordinate2index(self, x):
        x = (x * self.cfg.plane_size).long()
        index = x[..., 0] + self.cfg.plane_size * x[..., 1]
        assert index.max() < self.cfg.plane_size ** 2
        return index[:, None, :]

    def forward(self, p):
        batch_size, T, D = p.shape

        # acquire the index for each point
        coord = {}
        index = {}

        position = torch.clamp(p[..., :3], -self.cfg.radius + 1e-6, self.cfg.radius - 1e-6)
        position_norm = scale_tensor(position, (-self.cfg.radius, self.cfg.radius), (0, 1))
        coord["xy"] = position_norm[..., [0, 1]]
        coord["xz"] = position_norm[..., [0, 2]]
        coord["yz"] = position_norm[..., [1, 2]]
        index["xy"] = self.coordinate2index(coord["xy"])
        index["xz"] = self.coordinate2index(coord["xz"])
        index["yz"] = self.coordinate2index(coord["yz"])

        net = self.fc_pos(p)

        net = self.blocks[0](net)
        for block in self.blocks[1:]:
            pooled = self.pool_local(coord, index, net)
            net = torch.cat([net, pooled], dim=2)
            net = block(net)

        c = self.fc_c(net)

        features = torch.stack([
            self.generate_plane_features(index["xy"], c),
            self.generate_plane_features(index["xz"], c),
            self.generate_plane_features(index["yz"], c)
        ], dim=1)

        return features
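coordinate2index flattens each normalized (u, v) coordinate into a cell of the plane_size x plane_size feature grid before the scatter step. A worked example with the default plane_size of 32 from the Config above:

import torch

plane_size = 32
uv = torch.tensor([[[0.50, 0.25]]])           # one batch, one point, normalized to [0, 1)
cell = (uv * plane_size).long()               # -> [[[16, 8]]]
index = cell[..., 0] + plane_size * cell[..., 1]
print(index)                                  # tensor([[272]]) since 16 + 32 * 8 = 272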
hort/models/tgs/models/pointclouds/simplepoint.py
ADDED
@@ -0,0 +1,110 @@
from dataclasses import dataclass, field
import torch
from einops import rearrange

import tgs
from tgs.utils.base import BaseModule
from tgs.utils.typing import *

class SimplePointGenerator(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        camera_embedder_cls: str = ""
        camera_embedder: dict = field(default_factory=dict)

        image_tokenizer_cls: str = ""
        image_tokenizer: dict = field(default_factory=dict)

        tokenizer_cls: str = ""
        tokenizer: dict = field(default_factory=dict)

        backbone_cls: str = ""
        backbone: dict = field(default_factory=dict)

        post_processor_cls: str = ""
        post_processor: dict = field(default_factory=dict)

        pointcloud_upsampling_cls: str = ""
        pointcloud_upsampling: dict = field(default_factory=dict)

        flip_c2w_cond: bool = True

    cfg: Config

    def configure(self) -> None:
        super().configure()

        self.image_tokenizer = tgs.find(self.cfg.image_tokenizer_cls)(
            self.cfg.image_tokenizer
        )

        assert self.cfg.camera_embedder_cls == 'tgs.models.networks.MLP'
        weights = self.cfg.camera_embedder.pop("weights") if "weights" in self.cfg.camera_embedder else None
        self.camera_embedder = tgs.find(self.cfg.camera_embedder_cls)(**self.cfg.camera_embedder)
        if weights:
            from tgs.utils.misc import load_module_weights
            weights_path, module_name = weights.split(":")
            state_dict = load_module_weights(
                weights_path, module_name=module_name, map_location="cpu"
            )
            self.camera_embedder.load_state_dict(state_dict)

        self.tokenizer = tgs.find(self.cfg.tokenizer_cls)(self.cfg.tokenizer)

        self.backbone = tgs.find(self.cfg.backbone_cls)(self.cfg.backbone)

        self.post_processor = tgs.find(self.cfg.post_processor_cls)(
            self.cfg.post_processor
        )

        self.pointcloud_upsampling = tgs.find(self.cfg.pointcloud_upsampling_cls)(self.cfg.pointcloud_upsampling)

    def forward(self, batch, encoder_hidden_states=None, **kwargs):
        batch_size, n_input_views = batch["rgb_cond"].shape[:2]

        if encoder_hidden_states is None:
            # Camera modulation
            c2w_cond = batch["c2w_cond"].clone()
            if self.cfg.flip_c2w_cond:
                c2w_cond[..., :3, 1:3] *= -1
            camera_extri = c2w_cond.view(*c2w_cond.shape[:-2], -1)
            camera_intri = batch["intrinsic_normed_cond"].view(
                *batch["intrinsic_normed_cond"].shape[:-2], -1)
            camera_feats = torch.cat([camera_intri, camera_extri], dim=-1)
            # camera_feats = rearrange(camera_feats, 'B Nv C -> (B Nv) C')

            camera_feats = self.camera_embedder(camera_feats)

            encoder_hidden_states: Float[Tensor, "B Cit Nit"] = self.image_tokenizer(
                rearrange(batch["rgb_cond"], 'B Nv H W C -> B Nv C H W'),
                modulation_cond=camera_feats,
            )
            encoder_hidden_states = rearrange(
                encoder_hidden_states, 'B Nv C Nt -> B (Nv Nt) C', Nv=n_input_views)

        tokens: Float[Tensor, "B Ct Nt"] = self.tokenizer(batch_size)

        tokens = self.backbone(
            tokens,
            encoder_hidden_states=encoder_hidden_states,
            modulation_cond=None,
        )
        pointclouds = self.post_processor(self.tokenizer.detokenize(tokens))

        upsampling_input = {
            "input_image_tokens": encoder_hidden_states.permute(0, 2, 1),
            "input_image_tokens_global": encoder_hidden_states[:, :1],
            "c2w_cond": c2w_cond,
            "rgb_cond": batch["rgb_cond"],
            "intrinsic_cond": batch["intrinsic_cond"],
            "intrinsic_normed_cond": batch["intrinsic_normed_cond"],
            "points": pointclouds.float()
        }
        up_results = self.pointcloud_upsampling(upsampling_input)
        up_results.insert(0, pointclouds)
        pointclouds = up_results[-1]
        out = {
            "points": pointclouds,
            "up_results": up_results
|
109 |
+
}
|
110 |
+
return out
|
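SimplePointGenerator modulates the image tokenizer with flattened camera matrices. A small sketch of just that conditioning-vector construction, using hypothetical identity cameras and a single input view:

import torch

# Hypothetical conditioning input: batch of 2, one input view.
B, Nv = 2, 1
c2w_cond = torch.eye(4).expand(B, Nv, 4, 4).clone()                # camera-to-world poses
intrinsic_normed_cond = torch.eye(3).expand(B, Nv, 3, 3).clone()   # normalized intrinsics

# Flip the camera y/z axes, as flip_c2w_cond does (a common camera-convention flip).
c2w_cond[..., :3, 1:3] *= -1

# Flatten both matrices per view and concatenate into one conditioning vector.
camera_extri = c2w_cond.view(B, Nv, -1)                # (B, Nv, 16)
camera_intri = intrinsic_normed_cond.view(B, Nv, -1)   # (B, Nv, 9)
camera_feats = torch.cat([camera_intri, camera_extri], dim=-1)
print(camera_feats.shape)  # torch.Size([2, 1, 25]) -> fed to the camera embedder MLP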
hort/models/tgs/models/renderer.py
ADDED
@@ -0,0 +1,427 @@
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from collections import defaultdict
|
3 |
+
from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
|
4 |
+
from plyfile import PlyData, PlyElement
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
import numpy as np
|
9 |
+
import math
|
10 |
+
|
11 |
+
from tgs.utils.typing import *
|
12 |
+
from tgs.utils.base import BaseModule
|
13 |
+
from tgs.utils.ops import trunc_exp
|
14 |
+
from tgs.models.networks import MLP
|
15 |
+
from tgs.utils.ops import scale_tensor
|
16 |
+
from einops import rearrange, reduce
|
17 |
+
|
18 |
+
inverse_sigmoid = lambda x: np.log(x / (1 - x))
|
19 |
+
|
20 |
+
def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
|
21 |
+
Rt = np.zeros((4, 4))
|
22 |
+
Rt[:3, :3] = R.transpose()
|
23 |
+
Rt[:3, 3] = t
|
24 |
+
Rt[3, 3] = 1.0
|
25 |
+
|
26 |
+
C2W = np.linalg.inv(Rt)
|
27 |
+
cam_center = C2W[:3, 3]
|
28 |
+
cam_center = (cam_center + translate) * scale
|
29 |
+
C2W[:3, 3] = cam_center
|
30 |
+
Rt = np.linalg.inv(C2W)
|
31 |
+
return np.float32(Rt)
|
32 |
+
|
33 |
+
def getProjectionMatrix(znear, zfar, fovX, fovY):
|
34 |
+
tanHalfFovY = math.tan((fovY / 2))
|
35 |
+
tanHalfFovX = math.tan((fovX / 2))
|
36 |
+
|
37 |
+
top = tanHalfFovY * znear
|
38 |
+
bottom = -top
|
39 |
+
right = tanHalfFovX * znear
|
40 |
+
left = -right
|
41 |
+
|
42 |
+
P = torch.zeros(4, 4)
|
43 |
+
|
44 |
+
z_sign = 1.0
|
45 |
+
|
46 |
+
P[0, 0] = 2.0 * znear / (right - left)
|
47 |
+
P[1, 1] = 2.0 * znear / (top - bottom)
|
48 |
+
P[0, 2] = (right + left) / (right - left)
|
49 |
+
P[1, 2] = (top + bottom) / (top - bottom)
|
50 |
+
P[3, 2] = z_sign
|
51 |
+
P[2, 2] = z_sign * zfar / (zfar - znear)
|
52 |
+
P[2, 3] = -(zfar * znear) / (zfar - znear)
|
53 |
+
return P
|
54 |
+
|
55 |
+
def intrinsic_to_fov(intrinsic, w, h):
|
56 |
+
fx, fy = intrinsic[0, 0], intrinsic[1, 1]
|
57 |
+
fov_x = 2 * torch.arctan2(w, 2 * fx)
|
58 |
+
fov_y = 2 * torch.arctan2(h, 2 * fy)
|
59 |
+
return fov_x, fov_y
|
60 |
+
|
61 |
+
|
62 |
+
class Camera:
|
63 |
+
def __init__(self, w2c, intrinsic, FoVx, FoVy, height, width, trans=np.array([0.0, 0.0, 0.0]), scale=1.0) -> None:
|
64 |
+
self.FoVx = FoVx
|
65 |
+
self.FoVy = FoVy
|
66 |
+
self.height = height
|
67 |
+
self.width = width
|
68 |
+
self.world_view_transform = w2c.transpose(0, 1)
|
69 |
+
|
70 |
+
self.zfar = 100.0
|
71 |
+
self.znear = 0.01
|
72 |
+
|
73 |
+
self.trans = trans
|
74 |
+
self.scale = scale
|
75 |
+
|
76 |
+
self.projection_matrix = getProjectionMatrix(znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy).transpose(0,1).to(w2c.device)
|
77 |
+
self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0)
|
78 |
+
self.camera_center = self.world_view_transform.inverse()[3, :3]
|
79 |
+
|
80 |
+
@staticmethod
|
81 |
+
def from_c2w(c2w, intrinsic, height, width):
|
82 |
+
w2c = torch.inverse(c2w)
|
83 |
+
FoVx, FoVy = intrinsic_to_fov(intrinsic, w=torch.tensor(width, device=w2c.device), h=torch.tensor(height, device=w2c.device))
|
84 |
+
return Camera(w2c=w2c, intrinsic=intrinsic, FoVx=FoVx, FoVy=FoVy, height=height, width=width)
|
85 |
+
|
86 |
+
class GaussianModel(NamedTuple):
|
87 |
+
xyz: Tensor
|
88 |
+
opacity: Tensor
|
89 |
+
rotation: Tensor
|
90 |
+
scaling: Tensor
|
91 |
+
shs: Tensor
|
92 |
+
|
93 |
+
def construct_list_of_attributes(self):
|
94 |
+
l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
|
95 |
+
features_dc = self.shs[:, :1]
|
96 |
+
features_rest = self.shs[:, 1:]
|
97 |
+
for i in range(features_dc.shape[1]*features_dc.shape[2]):
|
98 |
+
l.append('f_dc_{}'.format(i))
|
99 |
+
for i in range(features_rest.shape[1]*features_rest.shape[2]):
|
100 |
+
l.append('f_rest_{}'.format(i))
|
101 |
+
l.append('opacity')
|
102 |
+
for i in range(self.scaling.shape[1]):
|
103 |
+
l.append('scale_{}'.format(i))
|
104 |
+
for i in range(self.rotation.shape[1]):
|
105 |
+
l.append('rot_{}'.format(i))
|
106 |
+
return l
|
107 |
+
|
108 |
+
def save_ply(self, path):
|
109 |
+
|
110 |
+
xyz = self.xyz.detach().cpu().numpy()
|
111 |
+
normals = np.zeros_like(xyz)
|
112 |
+
features_dc = self.shs[:, :1]
|
113 |
+
features_rest = self.shs[:, 1:]
|
114 |
+
f_dc = features_dc.detach().flatten(start_dim=1).contiguous().cpu().numpy()
|
115 |
+
f_rest = features_rest.detach().flatten(start_dim=1).contiguous().cpu().numpy()
|
116 |
+
opacities = inverse_sigmoid(torch.clamp(self.opacity, 1e-3, 1 - 1e-3).detach().cpu().numpy())
|
117 |
+
scale = np.log(self.scaling.detach().cpu().numpy())
|
118 |
+
rotation = self.rotation.detach().cpu().numpy()
|
119 |
+
|
120 |
+
dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
|
121 |
+
|
122 |
+
elements = np.empty(xyz.shape[0], dtype=dtype_full)
|
123 |
+
attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
|
124 |
+
elements[:] = list(map(tuple, attributes))
|
125 |
+
el = PlyElement.describe(elements, 'vertex')
|
126 |
+
PlyData([el]).write(path)
|
127 |
+
|
128 |
+
class GSLayer(BaseModule):
|
129 |
+
@dataclass
|
130 |
+
class Config(BaseModule.Config):
|
131 |
+
in_channels: int = 128
|
132 |
+
feature_channels: dict = field(default_factory=dict)
|
133 |
+
xyz_offset: bool = True
|
134 |
+
restrict_offset: bool = False
|
135 |
+
use_rgb: bool = False
|
136 |
+
clip_scaling: Optional[float] = None
|
137 |
+
init_scaling: float = -5.0
|
138 |
+
init_density: float = 0.1
|
139 |
+
|
140 |
+
cfg: Config
|
141 |
+
|
142 |
+
def configure(self, *args, **kwargs) -> None:
|
143 |
+
self.out_layers = nn.ModuleList()
|
144 |
+
for key, out_ch in self.cfg.feature_channels.items():
|
145 |
+
if key == "shs" and self.cfg.use_rgb:
|
146 |
+
out_ch = 3
|
147 |
+
layer = nn.Linear(self.cfg.in_channels, out_ch)
|
148 |
+
|
149 |
+
# initialize
|
150 |
+
if not (key == "shs" and self.cfg.use_rgb):
|
151 |
+
nn.init.constant_(layer.weight, 0)
|
152 |
+
nn.init.constant_(layer.bias, 0)
|
153 |
+
if key == "scaling":
|
154 |
+
nn.init.constant_(layer.bias, self.cfg.init_scaling)
|
155 |
+
elif key == "rotation":
|
156 |
+
nn.init.constant_(layer.bias, 0)
|
157 |
+
nn.init.constant_(layer.bias[0], 1.0)
|
158 |
+
elif key == "opacity":
|
159 |
+
nn.init.constant_(layer.bias, inverse_sigmoid(self.cfg.init_density))
|
160 |
+
|
161 |
+
self.out_layers.append(layer)
|
162 |
+
|
163 |
+
def forward(self, x, pts):
|
164 |
+
ret = {}
|
165 |
+
for k, layer in zip(self.cfg.feature_channels.keys(), self.out_layers):
|
166 |
+
v = layer(x)
|
167 |
+
if k == "rotation":
|
168 |
+
v = torch.nn.functional.normalize(v)
|
169 |
+
elif k == "scaling":
|
170 |
+
v = trunc_exp(v)
|
171 |
+
if self.cfg.clip_scaling is not None:
|
172 |
+
v = torch.clamp(v, min=0, max=self.cfg.clip_scaling)
|
173 |
+
elif k == "opacity":
|
174 |
+
v = torch.sigmoid(v)
|
175 |
+
elif k == "shs":
|
176 |
+
if self.cfg.use_rgb:
|
177 |
+
v = torch.sigmoid(v)
|
178 |
+
v = torch.reshape(v, (v.shape[0], -1, 3))
|
179 |
+
elif k == "xyz":
|
180 |
+
if self.cfg.restrict_offset:
|
181 |
+
max_step = 1.2 / 32
|
182 |
+
v = (torch.sigmoid(v) - 0.5) * max_step
|
183 |
+
v = v + pts if self.cfg.xyz_offset else pts
|
184 |
+
ret[k] = v
|
185 |
+
|
186 |
+
return GaussianModel(**ret)
|
187 |
+
|
188 |
+
class GS3DRenderer(BaseModule):
|
189 |
+
@dataclass
|
190 |
+
class Config(BaseModule.Config):
|
191 |
+
mlp_network_config: Optional[dict] = None
|
192 |
+
gs_out: dict = field(default_factory=dict)
|
193 |
+
sh_degree: int = 3
|
194 |
+
scaling_modifier: float = 1.0
|
195 |
+
random_background: bool = False
|
196 |
+
radius: float = 1.0
|
197 |
+
feature_reduction: str = "concat"
|
198 |
+
projection_feature_dim: int = 773
|
199 |
+
background_color: Tuple[float, float, float] = field(
|
200 |
+
default_factory=lambda: (1.0, 1.0, 1.0)
|
201 |
+
)
|
202 |
+
|
203 |
+
cfg: Config
|
204 |
+
|
205 |
+
def configure(self, *args, **kwargs) -> None:
|
206 |
+
if self.cfg.feature_reduction == "mean":
|
207 |
+
mlp_in = 80
|
208 |
+
elif self.cfg.feature_reduction == "concat":
|
209 |
+
mlp_in = 80 * 3
|
210 |
+
else:
|
211 |
+
raise NotImplementedError
|
212 |
+
mlp_in = mlp_in + self.cfg.projection_feature_dim
|
213 |
+
if self.cfg.mlp_network_config is not None:
|
214 |
+
self.mlp_net = MLP(mlp_in, self.cfg.gs_out.in_channels, **self.cfg.mlp_network_config)
|
215 |
+
else:
|
216 |
+
self.cfg.gs_out.in_channels = mlp_in
|
217 |
+
self.gs_net = GSLayer(self.cfg.gs_out)
|
218 |
+
|
219 |
+
def forward_gs(self, x, p):
|
220 |
+
if self.cfg.mlp_network_config is not None:
|
221 |
+
x = self.mlp_net(x)
|
222 |
+
return self.gs_net(x, p)
|
223 |
+
|
224 |
+
def forward_single_view(self,
|
225 |
+
gs: GaussianModel,
|
226 |
+
viewpoint_camera: Camera,
|
227 |
+
background_color: Optional[Float[Tensor, "3"]],
|
228 |
+
ret_mask: bool = True,
|
229 |
+
):
|
230 |
+
# Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
|
231 |
+
screenspace_points = torch.zeros_like(gs.xyz, dtype=gs.xyz.dtype, requires_grad=True, device=self.device) + 0
|
232 |
+
try:
|
233 |
+
screenspace_points.retain_grad()
|
234 |
+
except:
|
235 |
+
pass
|
236 |
+
|
237 |
+
bg_color = background_color
|
238 |
+
# Set up rasterization configuration
|
239 |
+
tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
|
240 |
+
tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
|
241 |
+
|
242 |
+
raster_settings = GaussianRasterizationSettings(
|
243 |
+
image_height=int(viewpoint_camera.height),
|
244 |
+
image_width=int(viewpoint_camera.width),
|
245 |
+
tanfovx=tanfovx,
|
246 |
+
tanfovy=tanfovy,
|
247 |
+
bg=bg_color,
|
248 |
+
scale_modifier=self.cfg.scaling_modifier,
|
249 |
+
viewmatrix=viewpoint_camera.world_view_transform,
|
250 |
+
projmatrix=viewpoint_camera.full_proj_transform.float(),
|
251 |
+
sh_degree=self.cfg.sh_degree,
|
252 |
+
campos=viewpoint_camera.camera_center,
|
253 |
+
prefiltered=False,
|
254 |
+
debug=False
|
255 |
+
)
|
256 |
+
|
257 |
+
rasterizer = GaussianRasterizer(raster_settings=raster_settings)
|
258 |
+
|
259 |
+
means3D = gs.xyz
|
260 |
+
means2D = screenspace_points
|
261 |
+
opacity = gs.opacity
|
262 |
+
|
263 |
+
# If precomputed 3d covariance is provided, use it. If not, then it will be computed from
|
264 |
+
# scaling / rotation by the rasterizer.
|
265 |
+
scales = None
|
266 |
+
rotations = None
|
267 |
+
cov3D_precomp = None
|
268 |
+
scales = gs.scaling
|
269 |
+
rotations = gs.rotation
|
270 |
+
|
271 |
+
# If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
|
272 |
+
# from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
|
273 |
+
shs = None
|
274 |
+
colors_precomp = None
|
275 |
+
if self.gs_net.cfg.use_rgb:
|
276 |
+
colors_precomp = gs.shs.squeeze(1)
|
277 |
+
else:
|
278 |
+
shs = gs.shs
|
279 |
+
|
280 |
+
# Rasterize visible Gaussians to image, obtain their radii (on screen).
|
281 |
+
with torch.autocast(device_type=self.device.type, dtype=torch.float32):
|
282 |
+
rendered_image, radii = rasterizer(
|
283 |
+
means3D = means3D,
|
284 |
+
means2D = means2D,
|
285 |
+
shs = shs,
|
286 |
+
colors_precomp = colors_precomp,
|
287 |
+
opacities = opacity,
|
288 |
+
scales = scales,
|
289 |
+
rotations = rotations,
|
290 |
+
cov3D_precomp = cov3D_precomp)
|
291 |
+
|
292 |
+
ret = {
|
293 |
+
"comp_rgb": rendered_image.permute(1, 2, 0),
|
294 |
+
"comp_rgb_bg": bg_color
|
295 |
+
}
|
296 |
+
|
297 |
+
if ret_mask:
|
298 |
+
mask_bg_color = torch.zeros(3, dtype=torch.float32, device=self.device)
|
299 |
+
raster_settings = GaussianRasterizationSettings(
|
300 |
+
image_height=int(viewpoint_camera.height),
|
301 |
+
image_width=int(viewpoint_camera.width),
|
302 |
+
tanfovx=tanfovx,
|
303 |
+
tanfovy=tanfovy,
|
304 |
+
bg=mask_bg_color,
|
305 |
+
scale_modifier=self.cfg.scaling_modifier,
|
306 |
+
viewmatrix=viewpoint_camera.world_view_transform,
|
307 |
+
projmatrix=viewpoint_camera.full_proj_transform.float(),
|
308 |
+
sh_degree=0,
|
309 |
+
campos=viewpoint_camera.camera_center,
|
310 |
+
prefiltered=False,
|
311 |
+
debug=False
|
312 |
+
)
|
313 |
+
rasterizer = GaussianRasterizer(raster_settings=raster_settings)
|
314 |
+
|
315 |
+
with torch.autocast(device_type=self.device.type, dtype=torch.float32):
|
316 |
+
rendered_mask, radii = rasterizer(
|
317 |
+
means3D = means3D,
|
318 |
+
means2D = means2D,
|
319 |
+
# shs = ,
|
320 |
+
colors_precomp = torch.ones_like(means3D),
|
321 |
+
opacities = opacity,
|
322 |
+
scales = scales,
|
323 |
+
rotations = rotations,
|
324 |
+
cov3D_precomp = cov3D_precomp)
|
325 |
+
ret["comp_mask"] = rendered_mask.permute(1, 2, 0)
|
326 |
+
|
327 |
+
return ret
|
328 |
+
|
329 |
+
def query_triplane(
|
330 |
+
self,
|
331 |
+
positions: Float[Tensor, "*B N 3"],
|
332 |
+
triplanes: Float[Tensor, "*B 3 Cp Hp Wp"],
|
333 |
+
) -> Dict[str, Tensor]:
|
334 |
+
batched = positions.ndim == 3
|
335 |
+
if not batched:
|
336 |
+
# no batch dimension
|
337 |
+
triplanes = triplanes[None, ...]
|
338 |
+
positions = positions[None, ...]
|
339 |
+
|
340 |
+
positions = scale_tensor(positions, (-self.cfg.radius, self.cfg.radius), (-1, 1))
|
341 |
+
indices2D: Float[Tensor, "B 3 N 2"] = torch.stack(
|
342 |
+
(positions[..., [0, 1]], positions[..., [0, 2]], positions[..., [1, 2]]),
|
343 |
+
dim=-3,
|
344 |
+
)
|
345 |
+
out: Float[Tensor, "B3 Cp 1 N"] = F.grid_sample(
|
346 |
+
rearrange(triplanes, "B Np Cp Hp Wp -> (B Np) Cp Hp Wp", Np=3),
|
347 |
+
rearrange(indices2D, "B Np N Nd -> (B Np) () N Nd", Np=3),
|
348 |
+
align_corners=False,
|
349 |
+
mode="bilinear",
|
350 |
+
)
|
351 |
+
if self.cfg.feature_reduction == "concat":
|
352 |
+
out = rearrange(out, "(B Np) Cp () N -> B N (Np Cp)", Np=3)
|
353 |
+
elif self.cfg.feature_reduction == "mean":
|
354 |
+
out = reduce(out, "(B Np) Cp () N -> B N Cp", Np=3, reduction="mean")
|
355 |
+
else:
|
356 |
+
raise NotImplementedError
|
357 |
+
|
358 |
+
if not batched:
|
359 |
+
out = out.squeeze(0)
|
360 |
+
|
361 |
+
return out
|
362 |
+
|
363 |
+
def forward_single_batch(
|
364 |
+
self,
|
365 |
+
gs_hidden_features: Float[Tensor, "Np Cp"],
|
366 |
+
query_points: Float[Tensor, "Np 3"],
|
367 |
+
c2ws: Float[Tensor, "Nv 4 4"],
|
368 |
+
intrinsics: Float[Tensor, "Nv 4 4"],
|
369 |
+
height: int,
|
370 |
+
width: int,
|
371 |
+
background_color: Optional[Float[Tensor, "3"]],
|
372 |
+
):
|
373 |
+
gs: GaussianModel = self.forward_gs(gs_hidden_features, query_points)
|
374 |
+
out_list = []
|
375 |
+
|
376 |
+
for c2w, intrinsic in zip(c2ws, intrinsics):
|
377 |
+
out_list.append(self.forward_single_view(
|
378 |
+
gs,
|
379 |
+
Camera.from_c2w(c2w, intrinsic, height, width),
|
380 |
+
background_color
|
381 |
+
))
|
382 |
+
|
383 |
+
out = defaultdict(list)
|
384 |
+
for out_ in out_list:
|
385 |
+
for k, v in out_.items():
|
386 |
+
out[k].append(v)
|
387 |
+
out = {k: torch.stack(v, dim=0) for k, v in out.items()}
|
388 |
+
out["3dgs"] = gs
|
389 |
+
|
390 |
+
return out
|
391 |
+
|
392 |
+
def forward(self,
|
393 |
+
gs_hidden_features: Float[Tensor, "B Np Cp"],
|
394 |
+
query_points: Float[Tensor, "B Np 3"],
|
395 |
+
c2w: Float[Tensor, "B Nv 4 4"],
|
396 |
+
intrinsic: Float[Tensor, "B Nv 4 4"],
|
397 |
+
height,
|
398 |
+
width,
|
399 |
+
additional_features: Optional[Float[Tensor, "B C H W"]] = None,
|
400 |
+
background_color: Optional[Float[Tensor, "B 3"]] = None,
|
401 |
+
**kwargs):
|
402 |
+
batch_size = gs_hidden_features.shape[0]
|
403 |
+
out_list = []
|
404 |
+
gs_hidden_features = self.query_triplane(query_points, gs_hidden_features)
|
405 |
+
if additional_features is not None:
|
406 |
+
gs_hidden_features = torch.cat([gs_hidden_features, additional_features], dim=-1)
|
407 |
+
|
408 |
+
for b in range(batch_size):
|
409 |
+
out_list.append(self.forward_single_batch(
|
410 |
+
gs_hidden_features[b],
|
411 |
+
query_points[b],
|
412 |
+
c2w[b],
|
413 |
+
intrinsic[b],
|
414 |
+
height, width,
|
415 |
+
background_color[b] if background_color is not None else None))
|
416 |
+
|
417 |
+
out = defaultdict(list)
|
418 |
+
for out_ in out_list:
|
419 |
+
for k, v in out_.items():
|
420 |
+
out[k].append(v)
|
421 |
+
for k, v in out.items():
|
422 |
+
if isinstance(v[0], torch.Tensor):
|
423 |
+
out[k] = torch.stack(v, dim=0)
|
424 |
+
else:
|
425 |
+
out[k] = v
|
426 |
+
return out
|
427 |
+
|
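renderer.py above builds splatting cameras from c2w poses and pinhole intrinsics. Below is a standalone sketch, with a hypothetical focal length and image size, of the two conversions it relies on: intrinsics to field of view (as in intrinsic_to_fov) and a symmetric perspective matrix consistent with getProjectionMatrix.

import math
import torch

# Hypothetical pinhole intrinsics for a 224 x 224 crop.
W = H = 224
fx = fy = 500.0
intrinsic = torch.tensor([[fx, 0.0, W / 2],
                          [0.0, fy, H / 2],
                          [0.0, 0.0, 1.0]])

# Field of view from focal length, mirroring intrinsic_to_fov.
fov_x = 2 * math.atan2(W, 2 * fx)
fov_y = 2 * math.atan2(H, 2 * fy)
print(math.degrees(fov_x), math.degrees(fov_y))  # ~25.3 degrees each

# A symmetric frustum built from those FoVs maps [znear, zfar] to [0, 1] depth,
# matching the z-row of getProjectionMatrix.
znear, zfar = 0.01, 100.0
tan_half_x, tan_half_y = math.tan(fov_x / 2), math.tan(fov_y / 2)
P = torch.zeros(4, 4)
P[0, 0] = 1.0 / tan_half_x
P[1, 1] = 1.0 / tan_half_y
P[2, 2] = zfar / (zfar - znear)
P[2, 3] = -(zfar * znear) / (zfar - znear)
P[3, 2] = 1.0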
hort/models/tgs/models/snowflake/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 AllenXiang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
hort/models/tgs/models/snowflake/SPD.py
ADDED
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
# @Author: Peng Xiang

import torch
import torch.nn as nn
from .utils import MLP_Res, MLP_CONV
from .skip_transformer import SkipTransformer


class SPD(nn.Module):
    def __init__(self, dim_feat=512, up_factor=2, i=0, radius=1, bounding=True, global_feat=True):
        """Snowflake Point Deconvolution"""
        super(SPD, self).__init__()
        self.i = i
        self.up_factor = up_factor

        self.bounding = bounding
        self.radius = radius

        self.global_feat = global_feat
        self.ps_dim = 32 if global_feat else 64

        self.mlp_1 = MLP_CONV(in_channel=3, layer_dims=[64, 128])
        self.mlp_2 = MLP_CONV(in_channel=128 * 2 + dim_feat if self.global_feat else 128, layer_dims=[256, 128])

        self.skip_transformer = SkipTransformer(in_channel=128, dim=64)

        self.mlp_ps = MLP_CONV(in_channel=128, layer_dims=[64, self.ps_dim])
        self.ps = nn.ConvTranspose1d(self.ps_dim, 128, up_factor, up_factor, bias=False)  # point-wise splitting

        self.up_sampler = nn.Upsample(scale_factor=up_factor)
        self.mlp_delta_feature = MLP_Res(in_dim=256, hidden_dim=128, out_dim=128)

        self.mlp_delta = MLP_CONV(in_channel=128, layer_dims=[64, 3])

    def forward(self, pcd_prev, feat_global=None, K_prev=None):
        """
        Args:
            pcd_prev: Tensor, (B, 3, N_prev)
            feat_global: Tensor, (B, dim_feat, 1)
            K_prev: Tensor, (B, 128, N_prev)

        Returns:
            pcd_child: Tensor, up sampled point cloud, (B, 3, N_prev * up_factor)
            K_curr: Tensor, displacement feature of current step, (B, 128, N_prev * up_factor)
        """
        b, _, n_prev = pcd_prev.shape
        feat_1 = self.mlp_1(pcd_prev)
        feat_1 = torch.cat([feat_1,
                            torch.max(feat_1, 2, keepdim=True)[0].repeat((1, 1, feat_1.size(2))),
                            feat_global.repeat(1, 1, feat_1.size(2))], 1) if self.global_feat else feat_1
        Q = self.mlp_2(feat_1)

        H = self.skip_transformer(pcd_prev, K_prev if K_prev is not None else Q, Q)

        feat_child = self.mlp_ps(H)
        feat_child = self.ps(feat_child)  # (B, 128, N_prev * up_factor)
        H_up = self.up_sampler(H)
        K_curr = self.mlp_delta_feature(torch.cat([feat_child, H_up], 1))

        delta = self.mlp_delta(torch.relu(K_curr))
        if self.bounding:
            delta = torch.tanh(delta) / self.radius**self.i  # (B, 3, N_prev * up_factor)

        pcd_child = self.up_sampler(pcd_prev)
        pcd_child = pcd_child + delta

        return pcd_child, K_curr
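The heart of SPD is point-wise splitting plus a bounded displacement. A shape-level sketch with hypothetical sizes, and a random offset standing in for the learned mlp_delta:

import torch
import torch.nn as nn

# Hypothetical sizes: batch 2, 512 parent points, upsampling factor 2, stage i=1.
B, N_prev, up_factor, radius, i = 2, 512, 2, 1.0, 1
pcd_prev = torch.randn(B, 3, N_prev)

# Point-wise splitting: each parent point is repeated up_factor times...
up_sampler = nn.Upsample(scale_factor=up_factor)
pcd_child = up_sampler(pcd_prev)                 # (B, 3, N_prev * up_factor)

# ...and nudged by a bounded displacement (random here; mlp_delta in SPD).
delta = torch.randn(B, 3, N_prev * up_factor)
delta = torch.tanh(delta) / radius ** i          # each offset squashed into (-1, 1) / radius**i
pcd_child = pcd_child + delta
print(pcd_child.shape)  # torch.Size([2, 3, 1024])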
hort/models/tgs/models/snowflake/SPD_crossattn.py
ADDED
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
# @Author: Peng Xiang

import torch
import torch.nn as nn
from .utils import MLP_Res, MLP_CONV
from .skip_transformer import SkipTransformer
from .attention import ResidualTransformerBlock

class SPD_crossattn(nn.Module):
    def __init__(self, dim_feat=512, up_factor=2, i=0, radius=1, bounding=True, global_feat=True):
        """Snowflake Point Deconvolution"""
        super().__init__()
        self.i = i
        self.up_factor = up_factor

        self.bounding = bounding
        self.radius = radius

        self.global_feat = global_feat
        self.ps_dim = 32 if global_feat else 64

        self.mlp_1 = MLP_CONV(in_channel=3, layer_dims=[64, 128])
        self.pcd_image_attn = ResidualTransformerBlock(
            device=torch.device('cuda'),
            dtype=torch.float32,
            n_data=128,
            width=128,
            heads=8,
            init_scale=1.0,
        )

        self.mlp_2 = MLP_CONV(in_channel=128 * 2 + dim_feat if self.global_feat else 128, layer_dims=[256, 128])

        self.skip_transformer = SkipTransformer(in_channel=128, dim=64)

        self.mlp_ps = MLP_CONV(in_channel=128, layer_dims=[64, self.ps_dim])
        self.ps = nn.ConvTranspose1d(self.ps_dim, 128, up_factor, up_factor, bias=False)  # point-wise splitting

        self.up_sampler = nn.Upsample(scale_factor=up_factor)
        self.mlp_delta_feature = MLP_Res(in_dim=256, hidden_dim=128, out_dim=128)

        self.mlp_delta = MLP_CONV(in_channel=128, layer_dims=[64, 3])

    def forward(self, pcd_prev, feat_global=None, K_prev=None):
        """
        Args:
            pcd_prev: Tensor, (B, 3, N_prev)
            feat_global: Tensor, (B, dim_feat, 1)
            K_prev: Tensor, (B, 128, N_prev)

        Returns:
            pcd_child: Tensor, up sampled point cloud, (B, 3, N_prev * up_factor)
            K_curr: Tensor, displacement feature of current step, (B, 128, N_prev * up_factor)
        """
        b, _, n_prev = pcd_prev.shape
        feat_1 = self.mlp_1(pcd_prev)
        # feat_1 = torch.cat([feat_1,
        #                     torch.max(feat_1, 2, keepdim=True)[0].repeat((1, 1, feat_1.size(2))),
        #                     feat_global.repeat(1, 1, feat_1.size(2))], 1) if self.global_feat else feat_1
        feat_1 = torch.permute(feat_1, (0, 2, 1))
        feat_global = torch.permute(feat_global, (0, 2, 1))
        feat_1 = self.pcd_image_attn(feat_1, feat_global)
        Q = torch.permute(feat_1, (0, 2, 1))
        # Q = self.mlp_2(feat_1)

        H = self.skip_transformer(pcd_prev, K_prev if K_prev is not None else Q, Q)

        feat_child = self.mlp_ps(H)
        feat_child = self.ps(feat_child)  # (B, 128, N_prev * up_factor)
        H_up = self.up_sampler(H)
        K_curr = self.mlp_delta_feature(torch.cat([feat_child, H_up], 1))

        delta = self.mlp_delta(torch.relu(K_curr))
        if self.bounding:
            delta = torch.tanh(delta) / self.radius**self.i  # (B, 3, N_prev * up_factor)

        pcd_child = self.up_sampler(pcd_prev)
        pcd_child = pcd_child + delta

        return pcd_child, K_curr
hort/models/tgs/models/snowflake/SPD_pp.py
ADDED
@@ -0,0 +1,71 @@
import torch
import torch.nn as nn
from .utils import MLP_Res, MLP_CONV
from .skip_transformer import SkipTransformer

class SPD_pp(nn.Module):
    def __init__(self, dim_feat=512, up_factor=2, i=0, radius=1, bounding=True, global_feat=True):
        """Snowflake Point Deconvolution"""
        super(SPD_pp, self).__init__()
        self.i = i
        self.up_factor = up_factor

        self.bounding = bounding
        self.radius = radius

        self.global_feat = global_feat
        self.ps_dim = 32 if global_feat else 64

        self.mlp_1 = MLP_CONV(in_channel=3, layer_dims=[64, 128])
        self.mlp_2 = MLP_CONV(
            in_channel=128 * 2 + dim_feat if self.global_feat else 128, layer_dims=[256, 128])

        self.skip_transformer = SkipTransformer(in_channel=128, dim=64)

        self.mlp_ps = MLP_CONV(in_channel=128, layer_dims=[64, self.ps_dim])
        self.ps = nn.ConvTranspose1d(
            self.ps_dim, 128, up_factor, up_factor, bias=False)  # point-wise splitting

        self.up_sampler = nn.Upsample(scale_factor=up_factor)
        self.mlp_delta_feature = MLP_Res(
            in_dim=256, hidden_dim=128, out_dim=128)

        self.mlp_delta = MLP_CONV(in_channel=128, layer_dims=[64, 3])

    def forward(self, pcd_prev, feat_cond=None, K_prev=None):
        """
        Args:
            pcd_prev: Tensor, (B, 3, N_prev)
            feat_cond: Tensor, (B, dim_feat, N_prev)
            K_prev: Tensor, (B, 128, N_prev)

        Returns:
            pcd_child: Tensor, up sampled point cloud, (B, 3, N_prev * up_factor)
            K_curr: Tensor, displacement feature of current step, (B, 128, N_prev * up_factor)
        """
        b, _, n_prev = pcd_prev.shape
        feat_1 = self.mlp_1(pcd_prev)
        feat_1 = torch.cat([feat_1,
                            torch.max(feat_1, 2, keepdim=True)[
                                0].repeat((1, 1, feat_1.size(2))),
                            feat_cond], 1) if self.global_feat else feat_1
        Q = self.mlp_2(feat_1)

        H = self.skip_transformer(
            pcd_prev, K_prev if K_prev is not None else Q, Q)

        feat_child = self.mlp_ps(H)
        feat_child = self.ps(feat_child)  # (B, 128, N_prev * up_factor)
        H_up = self.up_sampler(H)
        K_curr = self.mlp_delta_feature(torch.cat([feat_child, H_up], 1))

        delta = self.mlp_delta(torch.relu(K_curr))
        if self.bounding:
            # (B, 3, N_prev * up_factor)
            delta = torch.tanh(delta) / self.radius**self.i

        pcd_child = self.up_sampler(pcd_prev)
        pcd_child = pcd_child + delta

        return pcd_child, K_curr
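Compared with SPD, SPD_pp concatenates a per-point conditioning map rather than one broadcast global vector. A quick shape check of the tensor it feeds into mlp_2, with hypothetical sizes:

import torch

# Hypothetical sizes matching SPD_pp's forward.
B, N_prev, dim_feat = 2, 512, 128
feat_1 = torch.randn(B, 128, N_prev)          # per-point features from mlp_1
feat_cond = torch.randn(B, dim_feat, N_prev)  # one conditioning token per point

global_pool = torch.max(feat_1, 2, keepdim=True)[0].repeat(1, 1, N_prev)
fused = torch.cat([feat_1, global_pool, feat_cond], dim=1)
print(fused.shape)  # torch.Size([2, 384, 512]) -> mlp_2 maps 128*2 + dim_feat down to 128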
hort/models/tgs/models/snowflake/attention.py
ADDED
@@ -0,0 +1,239 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import math
|
4 |
+
import math
|
5 |
+
from typing import Optional
|
6 |
+
from typing import Callable, Iterable, Sequence, Union
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
def checkpoint(
|
11 |
+
func: Callable[..., Union[torch.Tensor, Sequence[torch.Tensor]]],
|
12 |
+
inputs: Sequence[torch.Tensor],
|
13 |
+
params: Iterable[torch.Tensor],
|
14 |
+
flag: bool,
|
15 |
+
):
|
16 |
+
"""
|
17 |
+
Evaluate a function without caching intermediate activations, allowing for
|
18 |
+
reduced memory at the expense of extra compute in the backward pass.
|
19 |
+
:param func: the function to evaluate.
|
20 |
+
:param inputs: the argument sequence to pass to `func`.
|
21 |
+
:param params: a sequence of parameters `func` depends on but does not
|
22 |
+
explicitly take as arguments.
|
23 |
+
:param flag: if False, disable gradient checkpointing.
|
24 |
+
"""
|
25 |
+
if flag:
|
26 |
+
args = tuple(inputs) + tuple(params)
|
27 |
+
return CheckpointFunction.apply(func, len(inputs), *args)
|
28 |
+
else:
|
29 |
+
return func(*inputs)
|
30 |
+
|
31 |
+
|
32 |
+
class CheckpointFunction(torch.autograd.Function):
|
33 |
+
@staticmethod
|
34 |
+
def forward(ctx, run_function, length, *args):
|
35 |
+
ctx.run_function = run_function
|
36 |
+
ctx.input_tensors = list(args[:length])
|
37 |
+
ctx.input_params = list(args[length:])
|
38 |
+
with torch.no_grad():
|
39 |
+
output_tensors = ctx.run_function(*ctx.input_tensors)
|
40 |
+
return output_tensors
|
41 |
+
|
42 |
+
@staticmethod
|
43 |
+
def backward(ctx, *output_grads):
|
44 |
+
ctx.input_tensors = [x.detach().requires_grad_(True)
|
45 |
+
for x in ctx.input_tensors]
|
46 |
+
with torch.enable_grad():
|
47 |
+
# Fixes a bug where the first op in run_function modifies the
|
48 |
+
# Tensor storage in place, which is not allowed for detach()'d
|
49 |
+
# Tensors.
|
50 |
+
shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
|
51 |
+
output_tensors = ctx.run_function(*shallow_copies)
|
52 |
+
input_grads = torch.autograd.grad(
|
53 |
+
output_tensors,
|
54 |
+
ctx.input_tensors + ctx.input_params,
|
55 |
+
output_grads,
|
56 |
+
allow_unused=True,
|
57 |
+
)
|
58 |
+
del ctx.input_tensors
|
59 |
+
del ctx.input_params
|
60 |
+
del output_tensors
|
61 |
+
return (None, None) + input_grads
|
62 |
+
|
63 |
+
|
64 |
+
def init_linear(l, stddev):
|
65 |
+
nn.init.normal_(l.weight, std=stddev)
|
66 |
+
if l.bias is not None:
|
67 |
+
nn.init.constant_(l.bias, 0.0)
|
68 |
+
|
69 |
+
class MLP(nn.Module):
|
70 |
+
def __init__(self, *, device: torch.device, dtype: torch.dtype, width: int, init_scale: float):
|
71 |
+
super().__init__()
|
72 |
+
self.width = width
|
73 |
+
self.c_fc = nn.Linear(width, width * 4, device=device, dtype=dtype)
|
74 |
+
self.c_proj = nn.Linear(width * 4, width, device=device, dtype=dtype)
|
75 |
+
self.gelu = nn.GELU()
|
76 |
+
init_linear(self.c_fc, init_scale)
|
77 |
+
init_linear(self.c_proj, init_scale)
|
78 |
+
|
79 |
+
def forward(self, x):
|
80 |
+
return self.c_proj(self.gelu(self.c_fc(x)))
|
81 |
+
|
82 |
+
class QKVMultiheadCrossAttention(nn.Module):
|
83 |
+
def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_data: int):
|
84 |
+
super().__init__()
|
85 |
+
self.device = device
|
86 |
+
self.dtype = dtype
|
87 |
+
self.heads = heads
|
88 |
+
self.n_data = n_data
|
89 |
+
|
90 |
+
def forward(self, q, kv):
|
91 |
+
_, n_ctx, _ = q.shape
|
92 |
+
bs, n_data, width = kv.shape
|
93 |
+
attn_ch = width // self.heads // 2
|
94 |
+
scale = 1 / math.sqrt(math.sqrt(attn_ch))
|
95 |
+
q = q.view(bs, n_ctx, self.heads, -1)
|
96 |
+
kv = kv.view(bs, n_data, self.heads, -1)
|
97 |
+
k, v = torch.split(kv, attn_ch, dim=-1)
|
98 |
+
weight = torch.einsum(
|
99 |
+
"bthc,bshc->bhts", q * scale, k * scale
|
100 |
+
) # More stable with f16 than dividing afterwards
|
101 |
+
wdtype = weight.dtype
|
102 |
+
weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
|
103 |
+
return torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
class QKVMultiheadAttention(nn.Module):
|
108 |
+
def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_ctx: int):
|
109 |
+
super().__init__()
|
110 |
+
self.device = device
|
111 |
+
self.dtype = dtype
|
112 |
+
self.heads = heads
|
113 |
+
self.n_ctx = n_ctx
|
114 |
+
|
115 |
+
def forward(self, qkv):
|
116 |
+
bs, n_ctx, width = qkv.shape
|
117 |
+
attn_ch = width // self.heads // 3
|
118 |
+
scale = 1 / math.sqrt(math.sqrt(attn_ch))
|
119 |
+
qkv = qkv.view(bs, n_ctx, self.heads, -1)
|
120 |
+
q, k, v = torch.split(qkv, attn_ch, dim=-1)
|
121 |
+
weight = torch.einsum(
|
122 |
+
"bthc,bshc->bhts", q * scale, k * scale
|
123 |
+
) # More stable with f16 than dividing afterwards
|
124 |
+
wdtype = weight.dtype
|
125 |
+
weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
|
126 |
+
return torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
class MultiheadCrossAttention(nn.Module):
|
131 |
+
def __init__(
|
132 |
+
self,
|
133 |
+
*,
|
134 |
+
device: torch.device,
|
135 |
+
dtype: torch.dtype,
|
136 |
+
n_data: int,
|
137 |
+
width: int,
|
138 |
+
heads: int,
|
139 |
+
init_scale: float,
|
140 |
+
data_width: Optional[int] = None,
|
141 |
+
):
|
142 |
+
super().__init__()
|
143 |
+
self.n_data = n_data
|
144 |
+
self.width = width
|
145 |
+
self.heads = heads
|
146 |
+
self.data_width = width if data_width is None else data_width
|
147 |
+
self.c_q = nn.Linear(width, width, device=device, dtype=dtype)
|
148 |
+
self.c_kv = nn.Linear(self.data_width, width * 2,
|
149 |
+
device=device, dtype=dtype)
|
150 |
+
self.c_proj = nn.Linear(width, width, device=device, dtype=dtype)
|
151 |
+
self.attention = QKVMultiheadCrossAttention(
|
152 |
+
device=device, dtype=dtype, heads=heads, n_data=n_data
|
153 |
+
)
|
154 |
+
init_linear(self.c_q, init_scale)
|
155 |
+
init_linear(self.c_kv, init_scale)
|
156 |
+
init_linear(self.c_proj, init_scale)
|
157 |
+
|
158 |
+
def forward(self, x, data):
|
159 |
+
x = self.c_q(x)
|
160 |
+
data = self.c_kv(data)
|
161 |
+
x = checkpoint(self.attention, (x, data), (), True)
|
162 |
+
x = self.c_proj(x)
|
163 |
+
return x
|
164 |
+
|
165 |
+
|
166 |
+
class MultiheadAttention(nn.Module):
|
167 |
+
def __init__(
|
168 |
+
self,
|
169 |
+
*,
|
170 |
+
device: torch.device,
|
171 |
+
dtype: torch.dtype,
|
172 |
+
n_ctx: int,
|
173 |
+
width: int,
|
174 |
+
heads: int,
|
175 |
+
init_scale: float,
|
176 |
+
):
|
177 |
+
super().__init__()
|
178 |
+
self.n_ctx = n_ctx
|
179 |
+
self.width = width
|
180 |
+
self.heads = heads
|
181 |
+
self.c_qkv = nn.Linear(width, width * 3, device=device, dtype=dtype)
|
182 |
+
self.c_proj = nn.Linear(width, width, device=device, dtype=dtype)
|
183 |
+
self.attention = QKVMultiheadAttention(device=device, dtype=dtype, heads=heads, n_ctx=n_ctx)
|
184 |
+
init_linear(self.c_qkv, init_scale)
|
185 |
+
init_linear(self.c_proj, init_scale)
|
186 |
+
|
187 |
+
def forward(self, x):
|
188 |
+
x = self.c_qkv(x)
|
189 |
+
x = checkpoint(self.attention, (x,), (), True)
|
190 |
+
x = self.c_proj(x)
|
191 |
+
return x
|
192 |
+
|
193 |
+
|
194 |
+
class ResidualTransformerBlock(nn.Module):
|
195 |
+
def __init__(
|
196 |
+
self,
|
197 |
+
*,
|
198 |
+
device: torch.device,
|
199 |
+
dtype: torch.dtype,
|
200 |
+
n_data: int,
|
201 |
+
width: int,
|
202 |
+
heads: int,
|
203 |
+
data_width: Optional[int] = None,
|
204 |
+
init_scale: float = 1.0,
|
205 |
+
):
|
206 |
+
super().__init__()
|
207 |
+
|
208 |
+
if data_width is None:
|
209 |
+
data_width = width
|
210 |
+
|
211 |
+
self.attn_cross = MultiheadCrossAttention(
|
212 |
+
device=device,
|
213 |
+
dtype=dtype,
|
214 |
+
n_data=n_data,
|
215 |
+
width=width,
|
216 |
+
heads=heads,
|
217 |
+
data_width=data_width,
|
218 |
+
init_scale=init_scale,
|
219 |
+
)
|
220 |
+
self.attn_self = MultiheadAttention(
|
221 |
+
device=device,
|
222 |
+
dtype=dtype,
|
223 |
+
n_ctx=n_data,
|
224 |
+
width=width,
|
225 |
+
heads=heads,
|
226 |
+
init_scale=init_scale,
|
227 |
+
)
|
228 |
+
self.ln_1 = nn.LayerNorm(width, device=device, dtype=dtype)
|
229 |
+
self.ln_2 = nn.LayerNorm(data_width, device=device, dtype=dtype)
|
230 |
+
self.ln_3 = nn.LayerNorm(width, device=device, dtype=dtype)
|
231 |
+
self.mlp = MLP(device=device, dtype=dtype,
|
232 |
+
width=width, init_scale=init_scale)
|
233 |
+
self.ln_4 = nn.LayerNorm(width, device=device, dtype=dtype)
|
234 |
+
|
235 |
+
def forward(self, x: torch.Tensor, data: torch.Tensor):
|
236 |
+
x = x + self.attn_cross(self.ln_1(x), self.ln_2(data))
|
237 |
+
x = x + self.attn_self(self.ln_3(x))
|
238 |
+
x = x + self.mlp(self.ln_4(x))
|
239 |
+
return x
|
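QKVMultiheadCrossAttention above attends point tokens to image tokens with a pair of einsums and symmetric 1/sqrt(sqrt(d)) scaling. A standalone numeric sketch of that attention core, with hypothetical sizes:

import math
import torch

# Hypothetical sizes: batch 2, 128 query (point) tokens, 257 data (image) tokens,
# width 128 split over 8 heads.
bs, n_ctx, n_data, heads, width = 2, 128, 257, 8, 128
attn_ch = width // heads  # channels per head

q = torch.randn(bs, n_ctx, heads, attn_ch)
k = torch.randn(bs, n_data, heads, attn_ch)
v = torch.randn(bs, n_data, heads, attn_ch)

# Scale q and k by 1/sqrt(sqrt(d)) each so the product carries the usual 1/sqrt(d);
# the file notes this is more stable in fp16 than dividing the logits afterwards.
scale = 1 / math.sqrt(math.sqrt(attn_ch))
weight = torch.einsum("bthc,bshc->bhts", q * scale, k * scale)  # (bs, heads, n_ctx, n_data)
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)

out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
print(out.shape)  # torch.Size([2, 128, 128])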
hort/models/tgs/models/snowflake/model_spdpp.py
ADDED
@@ -0,0 +1,239 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
from tgs.utils.base import BaseModule
|
6 |
+
from tgs.utils.typing import *
|
7 |
+
from dataclasses import dataclass, field
|
8 |
+
|
9 |
+
from pytorch3d.renderer import (
|
10 |
+
AlphaCompositor,
|
11 |
+
NormWeightedCompositor,
|
12 |
+
PointsRasterizationSettings,
|
13 |
+
PointsRasterizer,
|
14 |
+
PointsRenderer)
|
15 |
+
from pytorch3d.renderer.cameras import CamerasBase
|
16 |
+
from pytorch3d.structures import Pointclouds
|
17 |
+
from pytorch3d.utils.camera_conversions import cameras_from_opencv_projection
|
18 |
+
|
19 |
+
from .utils import fps_subsample
|
20 |
+
from einops import rearrange
|
21 |
+
|
22 |
+
from .utils import MLP_CONV
|
23 |
+
from .SPD import SPD
|
24 |
+
from .SPD_crossattn import SPD_crossattn
|
25 |
+
from .SPD_pp import SPD_pp
|
26 |
+
|
27 |
+
SPD_BLOCK = {
|
28 |
+
'SPD': SPD,
|
29 |
+
'SPD_crossattn': SPD_crossattn,
|
30 |
+
'SPD_PP': SPD_pp,
|
31 |
+
}
|
32 |
+
|
33 |
+
|
34 |
+
def homoify(points):
|
35 |
+
"""
|
36 |
+
Convert a batch of points to homogeneous coordinates.
|
37 |
+
Args:
|
38 |
+
points: e.g. (B, N, 3) or (N, 3)
|
39 |
+
Returns:
|
40 |
+
homoified points: e.g., (B, N, 4)
|
41 |
+
"""
|
42 |
+
points_dim = points.shape[:-1] + (1,)
|
43 |
+
ones = points.new_ones(points_dim)
|
44 |
+
|
45 |
+
return torch.cat([points, ones], dim=-1)
|
46 |
+
|
47 |
+
|
48 |
+
def dehomoify(points):
|
49 |
+
"""
|
50 |
+
Convert a batch of homogeneous points to cartesian coordinates.
|
51 |
+
Args:
|
52 |
+
homogeneous points: (B, N, 4/3) or (N, 4/3)
|
53 |
+
Returns:
|
54 |
+
cartesian points: (B, N, 3/2)
|
55 |
+
"""
|
56 |
+
return points[..., :-1] / points[..., -1:]
|
57 |
+
|
58 |
+
|
59 |
+
def mask_generation(points: Float[Tensor, "B Np 3"],
|
60 |
+
intrinsics: Float[Tensor, "B 3 3"],
|
61 |
+
input_img: Float[Tensor, "B C H W"],
|
62 |
+
raster_point_radius: float = 0.01, # point size
|
63 |
+
raster_points_per_pixel: int = 1, # a single point per pixel, for now
|
64 |
+
bin_size: int = 0):
|
65 |
+
"""
|
66 |
+
points: (B, Np, 3)
|
67 |
+
"""
|
68 |
+
B, C, H, W = input_img.shape
|
69 |
+
device = intrinsics.device
|
70 |
+
|
71 |
+
cam_R = torch.eye(3).to(device).unsqueeze(0).repeat(B, 1, 1)
|
72 |
+
cam_t = torch.zeros(3).to(device).unsqueeze(0).repeat(B, 1)
|
73 |
+
|
74 |
+
raster_settings = PointsRasterizationSettings(image_size=(H, W), radius=raster_point_radius, points_per_pixel=raster_points_per_pixel, bin_size=bin_size)
|
75 |
+
|
76 |
+
image_size = torch.as_tensor([H, W]).view(1, 2).expand(B, -1).to(device)
|
77 |
+
cameras = cameras_from_opencv_projection(cam_R, cam_t, intrinsics, image_size)
|
78 |
+
|
79 |
+
rasterize = PointsRasterizer(cameras=cameras, raster_settings=raster_settings)
|
80 |
+
fragments = rasterize(Pointclouds(points))
|
81 |
+
|
82 |
+
fragments_idx: Tensor = fragments.idx.long()
|
83 |
+
mask = (fragments_idx[..., 0] > -1)
|
84 |
+
|
85 |
+
return mask.float()
|
86 |
+
|
87 |
+
|
88 |
+
def points_projection(points: Float[Tensor, "B Np 3"],
|
89 |
+
intrinsics: Float[Tensor, "B 3 3"],
|
90 |
+
local_features: Float[Tensor, "B C H W"],
|
91 |
+
raster_point_radius: float = 0.0075, # point size
|
92 |
+
raster_points_per_pixel: int = 1, # a single point per pixel, for now
|
93 |
+
bin_size: int = 0):
|
94 |
+
"""
|
95 |
+
points: (B, Np, 3)
|
96 |
+
"""
|
97 |
+
B, C, H, W = local_features.shape
|
98 |
+
device = local_features.device
|
99 |
+
cam_R = torch.eye(3).to(device).unsqueeze(0).repeat(B, 1, 1)
|
100 |
+
cam_t = torch.zeros(3).to(device).unsqueeze(0).repeat(B, 1)
|
101 |
+
|
102 |
+
raster_settings = PointsRasterizationSettings(image_size=(H, W), radius=raster_point_radius, points_per_pixel=raster_points_per_pixel, bin_size=bin_size)
|
103 |
+
Np = points.shape[1]
|
104 |
+
R = raster_settings.points_per_pixel
|
105 |
+
image_size = torch.as_tensor([H, W]).view(1, 2).expand(B, -1).to(device)
|
106 |
+
cameras = cameras_from_opencv_projection(cam_R, cam_t, intrinsics, image_size)
|
107 |
+
rasterize = PointsRasterizer(cameras=cameras, raster_settings=raster_settings)
|
108 |
+
fragments = rasterize(Pointclouds(points))
|
109 |
+
fragments_idx: Tensor = fragments.idx.long()
|
110 |
+
visible_pixels = (fragments_idx > -1) # (B, H, W, R)
|
111 |
+
points_to_visible_pixels = fragments_idx[visible_pixels]
|
112 |
+
# Reshape local features to (B, H, W, R, C)
|
113 |
+
local_features = local_features.permute(0, 2, 3, 1).unsqueeze(-2).expand(-1, -1, -1, R, -1) # (B, H, W, R, C)
|
114 |
+
# Get local features corresponding to visible points
|
115 |
+
local_features_proj = torch.zeros(B * Np, C, device=device)
|
116 |
+
local_features_proj[points_to_visible_pixels] = local_features[visible_pixels]
|
117 |
+
local_features_proj = local_features_proj.reshape(B, Np, C)
|
118 |
+
return local_features_proj
|
119 |
+
|
120 |
+
|
121 |
+
def points_projection_v2(input_xyz_points, cam_intr, feature_maps):
|
122 |
+
input_points = input_xyz_points.clone()
|
123 |
+
batch_size = input_points.shape[0]
|
124 |
+
xyz = input_points[:, :, :3]
|
125 |
+
homo_xyz = homoify(xyz)
|
126 |
+
homo_xyz_2d = torch.matmul(cam_intr, homo_xyz.transpose(1, 2)).transpose(1, 2)
|
127 |
+
xyz_2d = (homo_xyz_2d[:, :, :2] / homo_xyz_2d[:, :, [2]]).unsqueeze(2)
|
128 |
+
uv_2d = xyz_2d / 224 * 2 - 1
|
129 |
+
sample_feat = torch.nn.functional.grid_sample(feature_maps, uv_2d, align_corners=True)[:, :, :, 0].transpose(1, 2)
|
130 |
+
uv_2d = uv_2d.squeeze(2).reshape((-1, 2))
|
131 |
+
validity = (uv_2d[:, 0] >= -1.0) & (uv_2d[:, 0] <= 1.0) & (uv_2d[:, 1] >= -1.0) & (uv_2d[:, 1] <= 1.0)
|
132 |
+
validity = validity.unsqueeze(1)
|
133 |
+
|
134 |
+
return sample_feat
|
135 |
+
|
136 |
+
|
137 |
+
class Decoder(nn.Module):
|
138 |
+
def __init__(self, input_channels=1152, dim_feat=512, num_p0=512,
|
139 |
+
radius=1, bounding=True, up_factors=None,
|
140 |
+
SPD_type='SPD',
|
141 |
+
token_type='image_token'
|
142 |
+
):
|
143 |
+
super(Decoder, self).__init__()
|
144 |
+
# self.decoder_coarse = SeedGenerator(dim_feat=dim_feat, num_pc=num_p0)
|
145 |
+
if up_factors is None:
|
146 |
+
up_factors = [1]
|
147 |
+
else:
|
148 |
+
up_factors = up_factors
|
149 |
+
uppers = []
|
150 |
+
self.num_p0 = num_p0
|
151 |
+
self.mlp_feat_cond = MLP_CONV(in_channel=input_channels,
|
152 |
+
layer_dims=[dim_feat*2, dim_feat])
|
153 |
+
|
154 |
+
for i, factor in enumerate(up_factors):
|
155 |
+
uppers.append(
|
156 |
+
SPD_BLOCK[SPD_type](dim_feat=dim_feat, up_factor=factor,
|
157 |
+
i=i, bounding=bounding, radius=radius))
|
158 |
+
self.uppers = nn.ModuleList(uppers)
|
159 |
+
self.token_type = token_type
|
160 |
+
|
161 |
+
def calculate_pcl_token(self, pcl_token, up_factor):
|
162 |
+
up_token = F.interpolate(pcl_token, scale_factor=up_factor, mode='nearest')
|
163 |
+
return up_token
|
164 |
+
|
165 |
+
def calculate_image_token(self, pcd, input_image_tokens, batch):
|
166 |
+
"""
|
167 |
+
Args:
|
168 |
+
"""
|
169 |
+
batch_size = input_image_tokens.shape[0]
|
170 |
+
h_cond, w_cond = 224, 224
|
171 |
+
input_image_tokens = input_image_tokens.permute(0, 2, 1)
|
172 |
+
local_features = input_image_tokens[:, 1:].reshape(batch_size, h_cond // 14, w_cond // 14, -1).permute(0, 3, 1, 2)
|
173 |
+
# local_features = F.interpolate(local_features, size=(h_cond, w_cond), mode='bilinear', align_corners=False)
|
174 |
+
local_features_proj = points_projection_v2(pcd * batch['scale'] + batch['trans'].unsqueeze(1), batch['intrinsic_cond'], local_features)
|
175 |
+
local_features_proj = local_features_proj.permute(0, 2, 1).contiguous()
|
176 |
+
|
177 |
+
return local_features_proj
|
178 |
+
|
179 |
+
def forward(self, x):
|
180 |
+
"""
|
181 |
+
Args:
|
182 |
+
points: Tensor, (b, num_p0, 3)
|
183 |
+
feat_cond: Tensor, (b, dim_feat) dinov2: 325x768
|
184 |
+
# partial_coarse: Tensor, (b, n_coarse, 3)
|
185 |
+
"""
|
186 |
+
points = x['points']
|
187 |
+
if self.token_type == 'pcl_token':
|
188 |
+
feat_cond = x['pcl_token']
|
189 |
+
elif self.token_type == 'image_token':
|
190 |
+
feat_cond = x['input_image_tokens']
|
191 |
+
feat_cond = self.mlp_feat_cond(feat_cond)
|
192 |
+
arr_pcd = []
|
193 |
+
feat_prev = None
|
194 |
+
|
195 |
+
pcd = torch.permute(points, (0, 2, 1)).contiguous()
|
196 |
+
pcl_up_scale = 1
|
197 |
+
for upper in self.uppers:
|
198 |
+
if self.token_type == 'pcl_token':
|
199 |
+
up_cond = self.calculate_pcl_token(
|
200 |
+
feat_cond, pcl_up_scale)
|
201 |
+
pcl_up_scale *= upper.up_factor
|
202 |
+
elif self.token_type == 'image_token':
|
203 |
+
up_cond = self.calculate_image_token(points, feat_cond, x)
|
204 |
+
pcd, feat_prev = upper(pcd, up_cond, feat_prev)
|
205 |
+
points = torch.permute(pcd, (0, 2, 1)).contiguous()
|
206 |
+
arr_pcd.append(points)
|
207 |
+
return arr_pcd
|
208 |
+
|
209 |
+
|
210 |
+
class SnowflakeModelSPDPP(BaseModule):
|
211 |
+
"""
|
212 |
+
apply PC^2 / PCL token to decoder
|
213 |
+
"""
|
214 |
+
@dataclass
|
215 |
+
class Config(BaseModule.Config):
|
216 |
+
input_channels: int = 1152
|
217 |
+
dim_feat: int = 128
|
218 |
+
num_p0: int = 512
|
219 |
+
radius: float = 1
|
220 |
+
bounding: bool = True
|
221 |
+
use_fps: bool = True
|
222 |
+
up_factors: List[int] = field(default_factory=lambda: [2, 2])
|
223 |
+
image_full_token_cond: bool = False
|
224 |
+
SPD_type: str = 'SPD_PP'
|
225 |
+
token_type: str = 'pcl_token'
|
226 |
+
cfg: Config
|
227 |
+
|
228 |
+
def configure(self) -> None:
|
229 |
+
super().configure()
|
230 |
+
self.decoder = Decoder(input_channels=self.cfg.input_channels,
|
231 |
+
dim_feat=self.cfg.dim_feat, num_p0=self.cfg.num_p0,
|
232 |
+
radius=self.cfg.radius, up_factors=self.cfg.up_factors, bounding=self.cfg.bounding,
|
233 |
+
SPD_type=self.cfg.SPD_type,
|
234 |
+
token_type=self.cfg.token_type
|
235 |
+
)
|
236 |
+
|
237 |
+
def forward(self, x):
|
238 |
+
results = self.decoder(x)
|
239 |
+
return results
|
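points_projection_v2 above conditions each point on image patch features (e.g. DINOv2 tokens) by projecting it through the camera intrinsics and sampling the feature map with grid_sample. A standalone sketch of that project-and-sample step, with hypothetical intrinsics, points, and a 16x16 feature grid:

import torch
import torch.nn.functional as F

# Hypothetical inputs: 2 clouds of 1024 points in front of the camera,
# a 768-channel 16x16 feature map, and a 224x224 image plane.
B, N, C, res = 2, 1024, 768, 16
points = torch.rand(B, N, 3) * 0.2 + torch.tensor([0.0, 0.0, 0.5])  # keep z > 0
feature_maps = torch.randn(B, C, res, res)
intrinsics = torch.tensor([[250.0, 0.0, 112.0],
                           [0.0, 250.0, 112.0],
                           [0.0, 0.0, 1.0]]).expand(B, 3, 3)

# Pinhole projection: uv = K @ xyz, then divide by depth.
uv_hom = torch.matmul(intrinsics, points.transpose(1, 2)).transpose(1, 2)  # (B, N, 3)
uv = uv_hom[:, :, :2] / uv_hom[:, :, 2:3]                                  # pixel coords

# Normalize pixel coordinates to [-1, 1] for grid_sample and gather per-point features.
grid = (uv / 224 * 2 - 1).unsqueeze(2)                                  # (B, N, 1, 2)
per_point_feat = F.grid_sample(feature_maps, grid, align_corners=True)  # (B, C, N, 1)
per_point_feat = per_point_feat[..., 0].transpose(1, 2)                 # (B, N, C)
print(per_point_feat.shape)  # torch.Size([2, 1024, 768])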
hort/models/tgs/models/snowflake/pointnet2.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import torch.nn as nn
import torch.nn.functional as F
from pointnet2_ops.pointnet2_modules import PointnetFPModule, PointnetSAModule


class PointNet2ClassificationSSG(nn.Module):
    def __init__(self):
        super().__init__()
        self._build_model()

    def _build_model(self):
        self.SA_modules = nn.ModuleList()
        self.SA_modules.append(
            PointnetSAModule(
                npoint=512,
                radius=0.2,
                nsample=64,
                mlp=[3, 64, 64, 128],
                use_xyz=True,
            )
        )
        self.SA_modules.append(
            PointnetSAModule(
                npoint=128,
                radius=0.4,
                nsample=64,
                mlp=[128, 128, 128, 256],
                use_xyz=True,
            )
        )
        self.SA_modules.append(
            PointnetSAModule(
                mlp=[256, 256, 512, 1024], use_xyz=True,
            )
        )

        self.fc_layer = nn.Sequential(
            nn.Linear(1024, 512, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Linear(512, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(256, 40),
        )

    def _break_up_pc(self, pc):
        xyz = pc[..., 0:3].contiguous()
        features = pc[..., 3:].transpose(1, 2).contiguous() if pc.size(-1) > 3 else None

        return xyz, features

    def forward(self, pointcloud):
        r"""
        Forward pass of the network

        Parameters
        ----------
        pointcloud: Variable(torch.cuda.FloatTensor)
            (B, N, 3 + input_channels) tensor
            Point cloud to run predictions on
            Each point in the point cloud MUST
            be formatted as (x, y, z, features...)
        """
        xyz, features = self._break_up_pc(pointcloud)

        for module in self.SA_modules:
            xyz, features = module(xyz, features)

        return self.fc_layer(features.squeeze(-1))


class PointNet2SemSegSSG(PointNet2ClassificationSSG):
    def _build_model(self):
        self.SA_modules = nn.ModuleList()
        self.SA_modules.append(
            PointnetSAModule(
                npoint=256,
                radius=0.05,
                nsample=32,
                mlp=[1, 32, 64],
                use_xyz=True,
            )
        )
        self.SA_modules.append(
            PointnetSAModule(
                npoint=64,
                radius=0.10,
                nsample=32,
                mlp=[64, 128, 256],
                use_xyz=True,
            )
        )
        self.SA_modules.append(
            PointnetSAModule(
                npoint=16,
                radius=0.20,
                nsample=32,
                mlp=[256, 512, 768],
                use_xyz=True,
            )
        )

    def forward(self, pointcloud):
        r"""
        Forward pass of the network

        Parameters
        ----------
        pointcloud: Variable(torch.cuda.FloatTensor)
            (B, N, 3 + input_channels) tensor
            Point cloud to run predictions on
            Each point in the point cloud MUST
            be formatted as (x, y, z, features...)
        """
        xyz, features = self._break_up_pc(pointcloud)

        l_xyz, l_features = [xyz], [features]
        for i in range(len(self.SA_modules)):
            li_xyz, li_features = self.SA_modules[i](l_xyz[i], l_features[i])
            l_xyz.append(li_xyz)
            l_features.append(li_features)

        return l_features[-1].transpose(2, 1)
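A quick usage sketch for the two networks above (assuming the pointnet2_ops extension is built and a GPU is available; the random inputs are only there to show the expected shapes):

import torch

cls_net = PointNet2ClassificationSSG().cuda()
pc = torch.randn(8, 4096, 6).cuda()      # (B, N, 3 + 3): xyz plus 3 feature channels, matching mlp=[3, ...]
logits = cls_net(pc)                     # (8, 40) classification scores

seg_net = PointNet2SemSegSSG().cuda()
pc = torch.randn(8, 4096, 4).cuda()      # (B, N, 3 + 1): xyz plus 1 feature channel, matching mlp=[1, ...]
tokens = seg_net(pc)                     # (8, 16, 768): 16 abstracted points with 768-dim features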
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/__init__.py
ADDED
@@ -0,0 +1,3 @@
import pointnet2_ops.pointnet2_modules
import pointnet2_ops.pointnet2_utils
from pointnet2_ops._version import __version__
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/ball_query.h
ADDED
@@ -0,0 +1,5 @@
#pragma once
#include <torch/extension.h>

at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
                      const int nsample);
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/cuda_utils.h
ADDED
@@ -0,0 +1,41 @@
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cmath>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>

#define TOTAL_THREADS 512

inline int opt_n_threads(int work_size) {
  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);

  return max(min(1 << pow_2, TOTAL_THREADS), 1);
}

inline dim3 opt_block_config(int x, int y) {
  const int x_threads = opt_n_threads(x);
  const int y_threads =
      max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
  dim3 block_config(x_threads, y_threads, 1);

  return block_config;
}

#define CUDA_CHECK_ERRORS()                                           \
  do {                                                                \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
              __FILE__);                                              \
      exit(-1);                                                       \
    }                                                                 \
  } while (0)

#endif
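opt_n_threads above rounds the work size down to a power of two (via a truncated log2) and clamps the result to [1, TOTAL_THREADS]; every kernel launch below sizes its blocks this way. A small Python mirror of that rule, purely to make the rounding explicit (not part of the library):

import math

def opt_n_threads(work_size, total_threads=512):
    # roughly the largest power of two <= work_size, clamped to [1, total_threads]
    pow_2 = int(math.log(work_size, 2.0)) if work_size > 0 else 0
    return max(min(1 << pow_2, total_threads), 1)

assert opt_n_threads(1000) == 512 and opt_n_threads(100) == 64 and opt_n_threads(3) == 2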
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/group_points.h
ADDED
@@ -0,0 +1,5 @@
#pragma once
#include <torch/extension.h>

at::Tensor group_points(at::Tensor points, at::Tensor idx);
at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/interpolate.h
ADDED
@@ -0,0 +1,10 @@
#pragma once

#include <torch/extension.h>
#include <vector>

std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows);
at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
                             at::Tensor weight);
at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
                                  at::Tensor weight, const int m);
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/sampling.h
ADDED
@@ -0,0 +1,6 @@
#pragma once
#include <torch/extension.h>

at::Tensor gather_points(at::Tensor points, at::Tensor idx);
at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples);
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/include/utils.h
ADDED
@@ -0,0 +1,25 @@
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>

#define CHECK_CUDA(x)                                     \
  do {                                                    \
    AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor");  \
  } while (0)

#define CHECK_CONTIGUOUS(x)                                           \
  do {                                                                \
    AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor");  \
  } while (0)

#define CHECK_IS_INT(x)                                \
  do {                                                 \
    AT_ASSERT(x.scalar_type() == at::ScalarType::Int,  \
              #x " must be an int tensor");            \
  } while (0)

#define CHECK_IS_FLOAT(x)                                \
  do {                                                   \
    AT_ASSERT(x.scalar_type() == at::ScalarType::Float,  \
              #x " must be a float tensor");             \
  } while (0)
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/ball_query.cpp
ADDED
@@ -0,0 +1,32 @@
#include "ball_query.h"
#include "utils.h"

void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
                                     int nsample, const float *new_xyz,
                                     const float *xyz, int *idx);

at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
                      const int nsample) {
  CHECK_CONTIGUOUS(new_xyz);
  CHECK_CONTIGUOUS(xyz);
  CHECK_IS_FLOAT(new_xyz);
  CHECK_IS_FLOAT(xyz);

  if (new_xyz.is_cuda()) {
    CHECK_CUDA(xyz);
  }

  at::Tensor idx =
      torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
                   at::device(new_xyz.device()).dtype(at::ScalarType::Int));

  if (new_xyz.is_cuda()) {
    query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
                                    radius, nsample, new_xyz.data_ptr<float>(),
                                    xyz.data_ptr<float>(), idx.data_ptr<int>());
  } else {
    AT_ASSERT(false, "CPU not supported");
  }

  return idx;
}
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/ball_query_gpu.cu
ADDED
@@ -0,0 +1,54 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "cuda_utils.h"

// input: new_xyz(b, m, 3) xyz(b, n, 3)
// output: idx(b, m, nsample)
__global__ void query_ball_point_kernel(int b, int n, int m, float radius,
                                        int nsample,
                                        const float *__restrict__ new_xyz,
                                        const float *__restrict__ xyz,
                                        int *__restrict__ idx) {
  int batch_index = blockIdx.x;
  xyz += batch_index * n * 3;
  new_xyz += batch_index * m * 3;
  idx += m * nsample * batch_index;

  int index = threadIdx.x;
  int stride = blockDim.x;

  float radius2 = radius * radius;
  for (int j = index; j < m; j += stride) {
    float new_x = new_xyz[j * 3 + 0];
    float new_y = new_xyz[j * 3 + 1];
    float new_z = new_xyz[j * 3 + 2];
    for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
      float x = xyz[k * 3 + 0];
      float y = xyz[k * 3 + 1];
      float z = xyz[k * 3 + 2];
      float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
                 (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          for (int l = 0; l < nsample; ++l) {
            idx[j * nsample + l] = k;
          }
        }
        idx[j * nsample + cnt] = k;
        ++cnt;
      }
    }
  }
}

void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
                                     int nsample, const float *new_xyz,
                                     const float *xyz, int *idx) {
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
      b, n, m, radius, nsample, new_xyz, xyz, idx);

  //CUDA_CHECK_ERRORS();
}
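The kernel above writes, for every query point, the indices of up to nsample neighbours that fall inside the radius, padding the tail of each row with the first hit (rows with no hit stay zero). A brute-force PyTorch sketch of the same contract, useful for spot-checking on tiny inputs (a reference only, not the library API):

import torch

def ball_query_reference(new_xyz, xyz, radius, nsample):
    # new_xyz: (B, M, 3) query centers, xyz: (B, N, 3) source points -> (B, M, nsample) int32 indices
    B, M, _ = new_xyz.shape
    idx = torch.zeros(B, M, nsample, dtype=torch.int32, device=xyz.device)
    dist2 = torch.cdist(new_xyz, xyz) ** 2
    for b in range(B):
        for j in range(M):
            hits = torch.nonzero(dist2[b, j] < radius * radius).flatten()
            if hits.numel() == 0:
                continue
            idx[b, j, :] = int(hits[0])              # pad with the first neighbour, like the kernel
            take = hits[:nsample].to(torch.int32)
            idx[b, j, :take.numel()] = take
    return idx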
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/bindings.cpp
ADDED
@@ -0,0 +1,19 @@
#include "ball_query.h"
#include "group_points.h"
#include "interpolate.h"
#include "sampling.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points", &gather_points);
  m.def("gather_points_grad", &gather_points_grad);
  m.def("furthest_point_sampling", &furthest_point_sampling);

  m.def("three_nn", &three_nn);
  m.def("three_interpolate", &three_interpolate);
  m.def("three_interpolate_grad", &three_interpolate_grad);

  m.def("ball_query", &ball_query);

  m.def("group_points", &group_points);
  m.def("group_points_grad", &group_points_grad);
}
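These bindings are compiled into the private module pointnet2_ops._ext; user code normally goes through the autograd wrappers defined later in this commit (pointnet2_utils.py). A short sketch of the typical call chain, assuming the extension is built and the tensors live on the GPU:

import torch
from pointnet2_ops import pointnet2_utils

xyz = torch.rand(2, 1024, 3).cuda()                                   # (B, N, 3)
fps_idx = pointnet2_utils.furthest_point_sample(xyz, 256)             # (B, 256) sampled indices
new_xyz = pointnet2_utils.gather_operation(
    xyz.transpose(1, 2).contiguous(), fps_idx)                        # (B, 3, 256) sampled coordinates
group_idx = pointnet2_utils.ball_query(
    0.2, 32, xyz, new_xyz.transpose(1, 2).contiguous())               # (B, 256, 32) neighbourhoods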
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/group_points.cpp
ADDED
@@ -0,0 +1,62 @@
1 |
+
#include "group_points.h"
|
2 |
+
#include "utils.h"
|
3 |
+
|
4 |
+
void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
|
5 |
+
const float *points, const int *idx,
|
6 |
+
float *out);
|
7 |
+
|
8 |
+
void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
|
9 |
+
int nsample, const float *grad_out,
|
10 |
+
const int *idx, float *grad_points);
|
11 |
+
|
12 |
+
at::Tensor group_points(at::Tensor points, at::Tensor idx) {
|
13 |
+
CHECK_CONTIGUOUS(points);
|
14 |
+
CHECK_CONTIGUOUS(idx);
|
15 |
+
CHECK_IS_FLOAT(points);
|
16 |
+
CHECK_IS_INT(idx);
|
17 |
+
|
18 |
+
if (points.is_cuda()) {
|
19 |
+
CHECK_CUDA(idx);
|
20 |
+
}
|
21 |
+
|
22 |
+
at::Tensor output =
|
23 |
+
torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)},
|
24 |
+
at::device(points.device()).dtype(at::ScalarType::Float));
|
25 |
+
|
26 |
+
if (points.is_cuda()) {
|
27 |
+
group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
|
28 |
+
idx.size(1), idx.size(2),
|
29 |
+
points.data_ptr<float>(), idx.data_ptr<int>(),
|
30 |
+
output.data_ptr<float>());
|
31 |
+
} else {
|
32 |
+
AT_ASSERT(false, "CPU not supported");
|
33 |
+
}
|
34 |
+
|
35 |
+
return output;
|
36 |
+
}
|
37 |
+
|
38 |
+
at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) {
|
39 |
+
CHECK_CONTIGUOUS(grad_out);
|
40 |
+
CHECK_CONTIGUOUS(idx);
|
41 |
+
CHECK_IS_FLOAT(grad_out);
|
42 |
+
CHECK_IS_INT(idx);
|
43 |
+
|
44 |
+
if (grad_out.is_cuda()) {
|
45 |
+
CHECK_CUDA(idx);
|
46 |
+
}
|
47 |
+
|
48 |
+
at::Tensor output =
|
49 |
+
torch::zeros({grad_out.size(0), grad_out.size(1), n},
|
50 |
+
at::device(grad_out.device()).dtype(at::ScalarType::Float));
|
51 |
+
|
52 |
+
if (grad_out.is_cuda()) {
|
53 |
+
group_points_grad_kernel_wrapper(
|
54 |
+
grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2),
|
55 |
+
grad_out.data_ptr<float>(), idx.data_ptr<int>(),
|
56 |
+
output.data_ptr<float>());
|
57 |
+
} else {
|
58 |
+
AT_ASSERT(false, "CPU not supported");
|
59 |
+
}
|
60 |
+
|
61 |
+
return output;
|
62 |
+
}
|
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/group_points_gpu.cu
ADDED
@@ -0,0 +1,75 @@
1 |
+
#include <stdio.h>
|
2 |
+
#include <stdlib.h>
|
3 |
+
|
4 |
+
#include "cuda_utils.h"
|
5 |
+
|
6 |
+
// input: points(b, c, n) idx(b, npoints, nsample)
|
7 |
+
// output: out(b, c, npoints, nsample)
|
8 |
+
__global__ void group_points_kernel(int b, int c, int n, int npoints,
|
9 |
+
int nsample,
|
10 |
+
const float *__restrict__ points,
|
11 |
+
const int *__restrict__ idx,
|
12 |
+
float *__restrict__ out) {
|
13 |
+
int batch_index = blockIdx.x;
|
14 |
+
points += batch_index * n * c;
|
15 |
+
idx += batch_index * npoints * nsample;
|
16 |
+
out += batch_index * npoints * nsample * c;
|
17 |
+
|
18 |
+
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
19 |
+
const int stride = blockDim.y * blockDim.x;
|
20 |
+
for (int i = index; i < c * npoints; i += stride) {
|
21 |
+
const int l = i / npoints;
|
22 |
+
const int j = i % npoints;
|
23 |
+
for (int k = 0; k < nsample; ++k) {
|
24 |
+
int ii = idx[j * nsample + k];
|
25 |
+
out[(l * npoints + j) * nsample + k] = points[l * n + ii];
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
|
30 |
+
void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
|
31 |
+
const float *points, const int *idx,
|
32 |
+
float *out) {
|
33 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
34 |
+
|
35 |
+
group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
|
36 |
+
b, c, n, npoints, nsample, points, idx, out);
|
37 |
+
|
38 |
+
//CUDA_CHECK_ERRORS();
|
39 |
+
}
|
40 |
+
|
41 |
+
// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample)
|
42 |
+
// output: grad_points(b, c, n)
|
43 |
+
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
|
44 |
+
int nsample,
|
45 |
+
const float *__restrict__ grad_out,
|
46 |
+
const int *__restrict__ idx,
|
47 |
+
float *__restrict__ grad_points) {
|
48 |
+
int batch_index = blockIdx.x;
|
49 |
+
grad_out += batch_index * npoints * nsample * c;
|
50 |
+
idx += batch_index * npoints * nsample;
|
51 |
+
grad_points += batch_index * n * c;
|
52 |
+
|
53 |
+
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
54 |
+
const int stride = blockDim.y * blockDim.x;
|
55 |
+
for (int i = index; i < c * npoints; i += stride) {
|
56 |
+
const int l = i / npoints;
|
57 |
+
const int j = i % npoints;
|
58 |
+
for (int k = 0; k < nsample; ++k) {
|
59 |
+
int ii = idx[j * nsample + k];
|
60 |
+
atomicAdd(grad_points + l * n + ii,
|
61 |
+
grad_out[(l * npoints + j) * nsample + k]);
|
62 |
+
}
|
63 |
+
}
|
64 |
+
}
|
65 |
+
|
66 |
+
void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
|
67 |
+
int nsample, const float *grad_out,
|
68 |
+
const int *idx, float *grad_points) {
|
69 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
70 |
+
|
71 |
+
group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
|
72 |
+
b, c, n, npoints, nsample, grad_out, idx, grad_points);
|
73 |
+
|
74 |
+
//CUDA_CHECK_ERRORS();
|
75 |
+
}
|
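group_points simply gathers points[b, c, idx[b, j, k]] into out[b, c, j, k]; on small inputs the same result can be reproduced with torch.gather, which makes a handy correctness check (a reference sketch only):

import torch

def group_points_reference(points, idx):
    # points: (B, C, N) float, idx: (B, npoints, nsample) int -> (B, C, npoints, nsample)
    B, C, N = points.shape
    _, npoints, nsample = idx.shape
    flat = idx.reshape(B, 1, npoints * nsample).expand(B, C, -1).long()
    return torch.gather(points, 2, flat).reshape(B, C, npoints, nsample)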
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/interpolate.cpp
ADDED
@@ -0,0 +1,99 @@
1 |
+
#include "interpolate.h"
|
2 |
+
#include "utils.h"
|
3 |
+
|
4 |
+
void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
|
5 |
+
const float *known, float *dist2, int *idx);
|
6 |
+
void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
|
7 |
+
const float *points, const int *idx,
|
8 |
+
const float *weight, float *out);
|
9 |
+
void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
|
10 |
+
const float *grad_out,
|
11 |
+
const int *idx, const float *weight,
|
12 |
+
float *grad_points);
|
13 |
+
|
14 |
+
std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) {
|
15 |
+
CHECK_CONTIGUOUS(unknowns);
|
16 |
+
CHECK_CONTIGUOUS(knows);
|
17 |
+
CHECK_IS_FLOAT(unknowns);
|
18 |
+
CHECK_IS_FLOAT(knows);
|
19 |
+
|
20 |
+
if (unknowns.is_cuda()) {
|
21 |
+
CHECK_CUDA(knows);
|
22 |
+
}
|
23 |
+
|
24 |
+
at::Tensor idx =
|
25 |
+
torch::zeros({unknowns.size(0), unknowns.size(1), 3},
|
26 |
+
at::device(unknowns.device()).dtype(at::ScalarType::Int));
|
27 |
+
at::Tensor dist2 =
|
28 |
+
torch::zeros({unknowns.size(0), unknowns.size(1), 3},
|
29 |
+
at::device(unknowns.device()).dtype(at::ScalarType::Float));
|
30 |
+
|
31 |
+
if (unknowns.is_cuda()) {
|
32 |
+
three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
|
33 |
+
unknowns.data_ptr<float>(), knows.data_ptr<float>(),
|
34 |
+
dist2.data_ptr<float>(), idx.data_ptr<int>());
|
35 |
+
} else {
|
36 |
+
AT_ASSERT(false, "CPU not supported");
|
37 |
+
}
|
38 |
+
|
39 |
+
return {dist2, idx};
|
40 |
+
}
|
41 |
+
|
42 |
+
at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
|
43 |
+
at::Tensor weight) {
|
44 |
+
CHECK_CONTIGUOUS(points);
|
45 |
+
CHECK_CONTIGUOUS(idx);
|
46 |
+
CHECK_CONTIGUOUS(weight);
|
47 |
+
CHECK_IS_FLOAT(points);
|
48 |
+
CHECK_IS_INT(idx);
|
49 |
+
CHECK_IS_FLOAT(weight);
|
50 |
+
|
51 |
+
if (points.is_cuda()) {
|
52 |
+
CHECK_CUDA(idx);
|
53 |
+
CHECK_CUDA(weight);
|
54 |
+
}
|
55 |
+
|
56 |
+
at::Tensor output =
|
57 |
+
torch::zeros({points.size(0), points.size(1), idx.size(1)},
|
58 |
+
at::device(points.device()).dtype(at::ScalarType::Float));
|
59 |
+
|
60 |
+
if (points.is_cuda()) {
|
61 |
+
three_interpolate_kernel_wrapper(
|
62 |
+
points.size(0), points.size(1), points.size(2), idx.size(1),
|
63 |
+
points.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
|
64 |
+
output.data_ptr<float>());
|
65 |
+
} else {
|
66 |
+
AT_ASSERT(false, "CPU not supported");
|
67 |
+
}
|
68 |
+
|
69 |
+
return output;
|
70 |
+
}
|
71 |
+
at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
|
72 |
+
at::Tensor weight, const int m) {
|
73 |
+
CHECK_CONTIGUOUS(grad_out);
|
74 |
+
CHECK_CONTIGUOUS(idx);
|
75 |
+
CHECK_CONTIGUOUS(weight);
|
76 |
+
CHECK_IS_FLOAT(grad_out);
|
77 |
+
CHECK_IS_INT(idx);
|
78 |
+
CHECK_IS_FLOAT(weight);
|
79 |
+
|
80 |
+
if (grad_out.is_cuda()) {
|
81 |
+
CHECK_CUDA(idx);
|
82 |
+
CHECK_CUDA(weight);
|
83 |
+
}
|
84 |
+
|
85 |
+
at::Tensor output =
|
86 |
+
torch::zeros({grad_out.size(0), grad_out.size(1), m},
|
87 |
+
at::device(grad_out.device()).dtype(at::ScalarType::Float));
|
88 |
+
|
89 |
+
if (grad_out.is_cuda()) {
|
90 |
+
three_interpolate_grad_kernel_wrapper(
|
91 |
+
grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
|
92 |
+
grad_out.data_ptr<float>(), idx.data_ptr<int>(),
|
93 |
+
weight.data_ptr<float>(), output.data_ptr<float>());
|
94 |
+
} else {
|
95 |
+
AT_ASSERT(false, "CPU not supported");
|
96 |
+
}
|
97 |
+
|
98 |
+
return output;
|
99 |
+
}
|
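three_nn finds, for every query point, its three nearest neighbours in the known set, and three_interpolate blends the neighbours' features with the given weights. Equivalent (but slow) PyTorch reference versions of both, written under the shape contract spelled out in the comments above:

import torch

def three_nn_reference(unknown, known):
    # unknown: (B, n, 3), known: (B, m, 3) -> squared distances and indices of the 3 nearest known points
    d2 = torch.cdist(unknown, known) ** 2
    dist2, idx = torch.topk(d2, k=3, dim=-1, largest=False)
    return dist2, idx.int()

def three_interpolate_reference(points, idx, weight):
    # points: (B, c, m), idx: (B, n, 3), weight: (B, n, 3) -> (B, c, n) weighted sum of 3 neighbours
    B, c, m = points.shape
    n = idx.shape[1]
    gathered = torch.gather(
        points, 2, idx.long().reshape(B, 1, n * 3).expand(B, c, -1)).reshape(B, c, n, 3)
    return (gathered * weight.unsqueeze(1)).sum(-1)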
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/interpolate_gpu.cu
ADDED
@@ -0,0 +1,154 @@
1 |
+
#include <math.h>
|
2 |
+
#include <stdio.h>
|
3 |
+
#include <stdlib.h>
|
4 |
+
|
5 |
+
#include "cuda_utils.h"
|
6 |
+
|
7 |
+
// input: unknown(b, n, 3) known(b, m, 3)
|
8 |
+
// output: dist2(b, n, 3), idx(b, n, 3)
|
9 |
+
__global__ void three_nn_kernel(int b, int n, int m,
|
10 |
+
const float *__restrict__ unknown,
|
11 |
+
const float *__restrict__ known,
|
12 |
+
float *__restrict__ dist2,
|
13 |
+
int *__restrict__ idx) {
|
14 |
+
int batch_index = blockIdx.x;
|
15 |
+
unknown += batch_index * n * 3;
|
16 |
+
known += batch_index * m * 3;
|
17 |
+
dist2 += batch_index * n * 3;
|
18 |
+
idx += batch_index * n * 3;
|
19 |
+
|
20 |
+
int index = threadIdx.x;
|
21 |
+
int stride = blockDim.x;
|
22 |
+
for (int j = index; j < n; j += stride) {
|
23 |
+
float ux = unknown[j * 3 + 0];
|
24 |
+
float uy = unknown[j * 3 + 1];
|
25 |
+
float uz = unknown[j * 3 + 2];
|
26 |
+
|
27 |
+
double best1 = 1e40, best2 = 1e40, best3 = 1e40;
|
28 |
+
int besti1 = 0, besti2 = 0, besti3 = 0;
|
29 |
+
for (int k = 0; k < m; ++k) {
|
30 |
+
float x = known[k * 3 + 0];
|
31 |
+
float y = known[k * 3 + 1];
|
32 |
+
float z = known[k * 3 + 2];
|
33 |
+
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
|
34 |
+
if (d < best1) {
|
35 |
+
best3 = best2;
|
36 |
+
besti3 = besti2;
|
37 |
+
best2 = best1;
|
38 |
+
besti2 = besti1;
|
39 |
+
best1 = d;
|
40 |
+
besti1 = k;
|
41 |
+
} else if (d < best2) {
|
42 |
+
best3 = best2;
|
43 |
+
besti3 = besti2;
|
44 |
+
best2 = d;
|
45 |
+
besti2 = k;
|
46 |
+
} else if (d < best3) {
|
47 |
+
best3 = d;
|
48 |
+
besti3 = k;
|
49 |
+
}
|
50 |
+
}
|
51 |
+
dist2[j * 3 + 0] = best1;
|
52 |
+
dist2[j * 3 + 1] = best2;
|
53 |
+
dist2[j * 3 + 2] = best3;
|
54 |
+
|
55 |
+
idx[j * 3 + 0] = besti1;
|
56 |
+
idx[j * 3 + 1] = besti2;
|
57 |
+
idx[j * 3 + 2] = besti3;
|
58 |
+
}
|
59 |
+
}
|
60 |
+
|
61 |
+
void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
|
62 |
+
const float *known, float *dist2, int *idx) {
|
63 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
64 |
+
three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known,
|
65 |
+
dist2, idx);
|
66 |
+
|
67 |
+
//CUDA_CHECK_ERRORS();
|
68 |
+
}
|
69 |
+
|
70 |
+
// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
|
71 |
+
// output: out(b, c, n)
|
72 |
+
__global__ void three_interpolate_kernel(int b, int c, int m, int n,
|
73 |
+
const float *__restrict__ points,
|
74 |
+
const int *__restrict__ idx,
|
75 |
+
const float *__restrict__ weight,
|
76 |
+
float *__restrict__ out) {
|
77 |
+
int batch_index = blockIdx.x;
|
78 |
+
points += batch_index * m * c;
|
79 |
+
|
80 |
+
idx += batch_index * n * 3;
|
81 |
+
weight += batch_index * n * 3;
|
82 |
+
|
83 |
+
out += batch_index * n * c;
|
84 |
+
|
85 |
+
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
86 |
+
const int stride = blockDim.y * blockDim.x;
|
87 |
+
for (int i = index; i < c * n; i += stride) {
|
88 |
+
const int l = i / n;
|
89 |
+
const int j = i % n;
|
90 |
+
float w1 = weight[j * 3 + 0];
|
91 |
+
float w2 = weight[j * 3 + 1];
|
92 |
+
float w3 = weight[j * 3 + 2];
|
93 |
+
|
94 |
+
int i1 = idx[j * 3 + 0];
|
95 |
+
int i2 = idx[j * 3 + 1];
|
96 |
+
int i3 = idx[j * 3 + 2];
|
97 |
+
|
98 |
+
out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 +
|
99 |
+
points[l * m + i3] * w3;
|
100 |
+
}
|
101 |
+
}
|
102 |
+
|
103 |
+
void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
|
104 |
+
const float *points, const int *idx,
|
105 |
+
const float *weight, float *out) {
|
106 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
107 |
+
three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
|
108 |
+
b, c, m, n, points, idx, weight, out);
|
109 |
+
|
110 |
+
//CUDA_CHECK_ERRORS();
|
111 |
+
}
|
112 |
+
|
113 |
+
// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
|
114 |
+
// output: grad_points(b, c, m)
|
115 |
+
|
116 |
+
__global__ void three_interpolate_grad_kernel(
|
117 |
+
int b, int c, int n, int m, const float *__restrict__ grad_out,
|
118 |
+
const int *__restrict__ idx, const float *__restrict__ weight,
|
119 |
+
float *__restrict__ grad_points) {
|
120 |
+
int batch_index = blockIdx.x;
|
121 |
+
grad_out += batch_index * n * c;
|
122 |
+
idx += batch_index * n * 3;
|
123 |
+
weight += batch_index * n * 3;
|
124 |
+
grad_points += batch_index * m * c;
|
125 |
+
|
126 |
+
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
127 |
+
const int stride = blockDim.y * blockDim.x;
|
128 |
+
for (int i = index; i < c * n; i += stride) {
|
129 |
+
const int l = i / n;
|
130 |
+
const int j = i % n;
|
131 |
+
float w1 = weight[j * 3 + 0];
|
132 |
+
float w2 = weight[j * 3 + 1];
|
133 |
+
float w3 = weight[j * 3 + 2];
|
134 |
+
|
135 |
+
int i1 = idx[j * 3 + 0];
|
136 |
+
int i2 = idx[j * 3 + 1];
|
137 |
+
int i3 = idx[j * 3 + 2];
|
138 |
+
|
139 |
+
atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
|
140 |
+
atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
|
141 |
+
atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
|
142 |
+
}
|
143 |
+
}
|
144 |
+
|
145 |
+
void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
|
146 |
+
const float *grad_out,
|
147 |
+
const int *idx, const float *weight,
|
148 |
+
float *grad_points) {
|
149 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
150 |
+
three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
|
151 |
+
b, c, n, m, grad_out, idx, weight, grad_points);
|
152 |
+
|
153 |
+
CUDA_CHECK_ERRORS();
|
154 |
+
}
|
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/sampling.cpp
ADDED
@@ -0,0 +1,87 @@
1 |
+
#include "sampling.h"
|
2 |
+
#include "utils.h"
|
3 |
+
|
4 |
+
void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
|
5 |
+
const float *points, const int *idx,
|
6 |
+
float *out);
|
7 |
+
void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
|
8 |
+
const float *grad_out, const int *idx,
|
9 |
+
float *grad_points);
|
10 |
+
|
11 |
+
void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
|
12 |
+
const float *dataset, float *temp,
|
13 |
+
int *idxs);
|
14 |
+
|
15 |
+
at::Tensor gather_points(at::Tensor points, at::Tensor idx) {
|
16 |
+
CHECK_CONTIGUOUS(points);
|
17 |
+
CHECK_CONTIGUOUS(idx);
|
18 |
+
CHECK_IS_FLOAT(points);
|
19 |
+
CHECK_IS_INT(idx);
|
20 |
+
|
21 |
+
if (points.is_cuda()) {
|
22 |
+
CHECK_CUDA(idx);
|
23 |
+
}
|
24 |
+
|
25 |
+
at::Tensor output =
|
26 |
+
torch::zeros({points.size(0), points.size(1), idx.size(1)},
|
27 |
+
at::device(points.device()).dtype(at::ScalarType::Float));
|
28 |
+
|
29 |
+
if (points.is_cuda()) {
|
30 |
+
gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
|
31 |
+
idx.size(1), points.data_ptr<float>(),
|
32 |
+
idx.data_ptr<int>(), output.data_ptr<float>());
|
33 |
+
} else {
|
34 |
+
AT_ASSERT(false, "CPU not supported");
|
35 |
+
}
|
36 |
+
|
37 |
+
return output;
|
38 |
+
}
|
39 |
+
|
40 |
+
at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx,
|
41 |
+
const int n) {
|
42 |
+
CHECK_CONTIGUOUS(grad_out);
|
43 |
+
CHECK_CONTIGUOUS(idx);
|
44 |
+
CHECK_IS_FLOAT(grad_out);
|
45 |
+
CHECK_IS_INT(idx);
|
46 |
+
|
47 |
+
if (grad_out.is_cuda()) {
|
48 |
+
CHECK_CUDA(idx);
|
49 |
+
}
|
50 |
+
|
51 |
+
at::Tensor output =
|
52 |
+
torch::zeros({grad_out.size(0), grad_out.size(1), n},
|
53 |
+
at::device(grad_out.device()).dtype(at::ScalarType::Float));
|
54 |
+
|
55 |
+
if (grad_out.is_cuda()) {
|
56 |
+
gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n,
|
57 |
+
idx.size(1), grad_out.data_ptr<float>(),
|
58 |
+
idx.data_ptr<int>(),
|
59 |
+
output.data_ptr<float>());
|
60 |
+
} else {
|
61 |
+
AT_ASSERT(false, "CPU not supported");
|
62 |
+
}
|
63 |
+
|
64 |
+
return output;
|
65 |
+
}
|
66 |
+
at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {
|
67 |
+
CHECK_CONTIGUOUS(points);
|
68 |
+
CHECK_IS_FLOAT(points);
|
69 |
+
|
70 |
+
at::Tensor output =
|
71 |
+
torch::zeros({points.size(0), nsamples},
|
72 |
+
at::device(points.device()).dtype(at::ScalarType::Int));
|
73 |
+
|
74 |
+
at::Tensor tmp =
|
75 |
+
torch::full({points.size(0), points.size(1)}, 1e10,
|
76 |
+
at::device(points.device()).dtype(at::ScalarType::Float));
|
77 |
+
|
78 |
+
if (points.is_cuda()) {
|
79 |
+
furthest_point_sampling_kernel_wrapper(
|
80 |
+
points.size(0), points.size(1), nsamples, points.data_ptr<float>(),
|
81 |
+
tmp.data_ptr<float>(), output.data_ptr<int>());
|
82 |
+
} else {
|
83 |
+
AT_ASSERT(false, "CPU not supported");
|
84 |
+
}
|
85 |
+
|
86 |
+
return output;
|
87 |
+
}
|
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_ext-src/src/sampling_gpu.cu
ADDED
@@ -0,0 +1,229 @@
1 |
+
#include <stdio.h>
|
2 |
+
#include <stdlib.h>
|
3 |
+
|
4 |
+
#include "cuda_utils.h"
|
5 |
+
|
6 |
+
// input: points(b, c, n) idx(b, m)
|
7 |
+
// output: out(b, c, m)
|
8 |
+
__global__ void gather_points_kernel(int b, int c, int n, int m,
|
9 |
+
const float *__restrict__ points,
|
10 |
+
const int *__restrict__ idx,
|
11 |
+
float *__restrict__ out) {
|
12 |
+
for (int i = blockIdx.x; i < b; i += gridDim.x) {
|
13 |
+
for (int l = blockIdx.y; l < c; l += gridDim.y) {
|
14 |
+
for (int j = threadIdx.x; j < m; j += blockDim.x) {
|
15 |
+
int a = idx[i * m + j];
|
16 |
+
out[(i * c + l) * m + j] = points[(i * c + l) * n + a];
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
|
22 |
+
void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
|
23 |
+
const float *points, const int *idx,
|
24 |
+
float *out) {
|
25 |
+
gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
|
26 |
+
at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints,
|
27 |
+
points, idx, out);
|
28 |
+
|
29 |
+
//CUDA_CHECK_ERRORS();
|
30 |
+
}
|
31 |
+
|
32 |
+
// input: grad_out(b, c, m) idx(b, m)
|
33 |
+
// output: grad_points(b, c, n)
|
34 |
+
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
|
35 |
+
const float *__restrict__ grad_out,
|
36 |
+
const int *__restrict__ idx,
|
37 |
+
float *__restrict__ grad_points) {
|
38 |
+
for (int i = blockIdx.x; i < b; i += gridDim.x) {
|
39 |
+
for (int l = blockIdx.y; l < c; l += gridDim.y) {
|
40 |
+
for (int j = threadIdx.x; j < m; j += blockDim.x) {
|
41 |
+
int a = idx[i * m + j];
|
42 |
+
atomicAdd(grad_points + (i * c + l) * n + a,
|
43 |
+
grad_out[(i * c + l) * m + j]);
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
47 |
+
}
|
48 |
+
|
49 |
+
void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
|
50 |
+
const float *grad_out, const int *idx,
|
51 |
+
float *grad_points) {
|
52 |
+
gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
|
53 |
+
at::cuda::getCurrentCUDAStream()>>>(
|
54 |
+
b, c, n, npoints, grad_out, idx, grad_points);
|
55 |
+
|
56 |
+
//CUDA_CHECK_ERRORS();
|
57 |
+
}
|
58 |
+
|
59 |
+
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
|
60 |
+
int idx1, int idx2) {
|
61 |
+
const float v1 = dists[idx1], v2 = dists[idx2];
|
62 |
+
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
|
63 |
+
dists[idx1] = max(v1, v2);
|
64 |
+
dists_i[idx1] = v2 > v1 ? i2 : i1;
|
65 |
+
}
|
66 |
+
|
67 |
+
// Input dataset: (b, n, 3), tmp: (b, n)
|
68 |
+
// Ouput idxs (b, m)
|
69 |
+
template <unsigned int block_size>
|
70 |
+
__global__ void furthest_point_sampling_kernel(
|
71 |
+
int b, int n, int m, const float *__restrict__ dataset,
|
72 |
+
float *__restrict__ temp, int *__restrict__ idxs) {
|
73 |
+
if (m <= 0) return;
|
74 |
+
__shared__ float dists[block_size];
|
75 |
+
__shared__ int dists_i[block_size];
|
76 |
+
|
77 |
+
int batch_index = blockIdx.x;
|
78 |
+
dataset += batch_index * n * 3;
|
79 |
+
temp += batch_index * n;
|
80 |
+
idxs += batch_index * m;
|
81 |
+
|
82 |
+
int tid = threadIdx.x;
|
83 |
+
const int stride = block_size;
|
84 |
+
|
85 |
+
int old = 0;
|
86 |
+
if (threadIdx.x == 0) idxs[0] = old;
|
87 |
+
|
88 |
+
__syncthreads();
|
89 |
+
for (int j = 1; j < m; j++) {
|
90 |
+
int besti = 0;
|
91 |
+
float best = -1;
|
92 |
+
float x1 = dataset[old * 3 + 0];
|
93 |
+
float y1 = dataset[old * 3 + 1];
|
94 |
+
float z1 = dataset[old * 3 + 2];
|
95 |
+
for (int k = tid; k < n; k += stride) {
|
96 |
+
float x2, y2, z2;
|
97 |
+
x2 = dataset[k * 3 + 0];
|
98 |
+
y2 = dataset[k * 3 + 1];
|
99 |
+
z2 = dataset[k * 3 + 2];
|
100 |
+
float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
|
101 |
+
if (mag <= 1e-3) continue;
|
102 |
+
|
103 |
+
float d =
|
104 |
+
(x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
|
105 |
+
|
106 |
+
float d2 = min(d, temp[k]);
|
107 |
+
temp[k] = d2;
|
108 |
+
besti = d2 > best ? k : besti;
|
109 |
+
best = d2 > best ? d2 : best;
|
110 |
+
}
|
111 |
+
dists[tid] = best;
|
112 |
+
dists_i[tid] = besti;
|
113 |
+
__syncthreads();
|
114 |
+
|
115 |
+
if (block_size >= 512) {
|
116 |
+
if (tid < 256) {
|
117 |
+
__update(dists, dists_i, tid, tid + 256);
|
118 |
+
}
|
119 |
+
__syncthreads();
|
120 |
+
}
|
121 |
+
if (block_size >= 256) {
|
122 |
+
if (tid < 128) {
|
123 |
+
__update(dists, dists_i, tid, tid + 128);
|
124 |
+
}
|
125 |
+
__syncthreads();
|
126 |
+
}
|
127 |
+
if (block_size >= 128) {
|
128 |
+
if (tid < 64) {
|
129 |
+
__update(dists, dists_i, tid, tid + 64);
|
130 |
+
}
|
131 |
+
__syncthreads();
|
132 |
+
}
|
133 |
+
if (block_size >= 64) {
|
134 |
+
if (tid < 32) {
|
135 |
+
__update(dists, dists_i, tid, tid + 32);
|
136 |
+
}
|
137 |
+
__syncthreads();
|
138 |
+
}
|
139 |
+
if (block_size >= 32) {
|
140 |
+
if (tid < 16) {
|
141 |
+
__update(dists, dists_i, tid, tid + 16);
|
142 |
+
}
|
143 |
+
__syncthreads();
|
144 |
+
}
|
145 |
+
if (block_size >= 16) {
|
146 |
+
if (tid < 8) {
|
147 |
+
__update(dists, dists_i, tid, tid + 8);
|
148 |
+
}
|
149 |
+
__syncthreads();
|
150 |
+
}
|
151 |
+
if (block_size >= 8) {
|
152 |
+
if (tid < 4) {
|
153 |
+
__update(dists, dists_i, tid, tid + 4);
|
154 |
+
}
|
155 |
+
__syncthreads();
|
156 |
+
}
|
157 |
+
if (block_size >= 4) {
|
158 |
+
if (tid < 2) {
|
159 |
+
__update(dists, dists_i, tid, tid + 2);
|
160 |
+
}
|
161 |
+
__syncthreads();
|
162 |
+
}
|
163 |
+
if (block_size >= 2) {
|
164 |
+
if (tid < 1) {
|
165 |
+
__update(dists, dists_i, tid, tid + 1);
|
166 |
+
}
|
167 |
+
__syncthreads();
|
168 |
+
}
|
169 |
+
|
170 |
+
old = dists_i[0];
|
171 |
+
if (tid == 0) idxs[j] = old;
|
172 |
+
}
|
173 |
+
}
|
174 |
+
|
175 |
+
void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
|
176 |
+
const float *dataset, float *temp,
|
177 |
+
int *idxs) {
|
178 |
+
unsigned int n_threads = opt_n_threads(n);
|
179 |
+
|
180 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
181 |
+
|
182 |
+
switch (n_threads) {
|
183 |
+
case 512:
|
184 |
+
furthest_point_sampling_kernel<512>
|
185 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
186 |
+
break;
|
187 |
+
case 256:
|
188 |
+
furthest_point_sampling_kernel<256>
|
189 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
190 |
+
break;
|
191 |
+
case 128:
|
192 |
+
furthest_point_sampling_kernel<128>
|
193 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
194 |
+
break;
|
195 |
+
case 64:
|
196 |
+
furthest_point_sampling_kernel<64>
|
197 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
198 |
+
break;
|
199 |
+
case 32:
|
200 |
+
furthest_point_sampling_kernel<32>
|
201 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
202 |
+
break;
|
203 |
+
case 16:
|
204 |
+
furthest_point_sampling_kernel<16>
|
205 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
206 |
+
break;
|
207 |
+
case 8:
|
208 |
+
furthest_point_sampling_kernel<8>
|
209 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
210 |
+
break;
|
211 |
+
case 4:
|
212 |
+
furthest_point_sampling_kernel<4>
|
213 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
214 |
+
break;
|
215 |
+
case 2:
|
216 |
+
furthest_point_sampling_kernel<2>
|
217 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
218 |
+
break;
|
219 |
+
case 1:
|
220 |
+
furthest_point_sampling_kernel<1>
|
221 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
222 |
+
break;
|
223 |
+
default:
|
224 |
+
furthest_point_sampling_kernel<512>
|
225 |
+
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
|
226 |
+
}
|
227 |
+
|
228 |
+
//CUDA_CHECK_ERRORS();
|
229 |
+
}
|
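The kernel above implements greedy farthest point sampling: it always seeds with index 0 and then repeatedly picks the point with the largest distance to the set chosen so far. A plain PyTorch version of the same loop for intuition (it ignores the kernel's near-origin skip, i.e. the mag <= 1e-3 early-out, and is far slower):

import torch

def furthest_point_sample_reference(xyz, npoint):
    # xyz: (B, N, 3) -> (B, npoint) indices of a greedy max-min sample
    B, N, _ = xyz.shape
    idx = torch.zeros(B, npoint, dtype=torch.long, device=xyz.device)
    closest = torch.full((B, N), 1e10, device=xyz.device)
    farthest = torch.zeros(B, dtype=torch.long, device=xyz.device)   # the kernel also seeds with index 0
    for j in range(npoint):
        idx[:, j] = farthest
        centroid = xyz[torch.arange(B, device=xyz.device), farthest].unsqueeze(1)   # (B, 1, 3)
        d = ((xyz - centroid) ** 2).sum(-1)
        closest = torch.minimum(closest, d)
        farthest = closest.argmax(-1)
    return idx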
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/_version.py
ADDED
@@ -0,0 +1 @@
__version__ = "3.0.0"
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/pointnet2_modules.py
ADDED
@@ -0,0 +1,209 @@
1 |
+
from typing import List, Optional, Tuple
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from pointnet2_ops import pointnet2_utils
|
7 |
+
|
8 |
+
|
9 |
+
def build_shared_mlp(mlp_spec: List[int], bn: bool = True):
|
10 |
+
layers = []
|
11 |
+
for i in range(1, len(mlp_spec)):
|
12 |
+
layers.append(
|
13 |
+
nn.Conv2d(mlp_spec[i - 1], mlp_spec[i], kernel_size=1, bias=not bn)
|
14 |
+
)
|
15 |
+
if bn:
|
16 |
+
layers.append(nn.BatchNorm2d(mlp_spec[i]))
|
17 |
+
layers.append(nn.ReLU(True))
|
18 |
+
|
19 |
+
return nn.Sequential(*layers)
|
20 |
+
|
21 |
+
|
22 |
+
class _PointnetSAModuleBase(nn.Module):
|
23 |
+
def __init__(self):
|
24 |
+
super(_PointnetSAModuleBase, self).__init__()
|
25 |
+
self.npoint = None
|
26 |
+
self.groupers = None
|
27 |
+
self.mlps = None
|
28 |
+
|
29 |
+
def forward(
|
30 |
+
self, xyz: torch.Tensor, features: Optional[torch.Tensor]
|
31 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
32 |
+
r"""
|
33 |
+
Parameters
|
34 |
+
----------
|
35 |
+
xyz : torch.Tensor
|
36 |
+
(B, N, 3) tensor of the xyz coordinates of the features
|
37 |
+
features : torch.Tensor
|
38 |
+
(B, C, N) tensor of the descriptors of the the features
|
39 |
+
|
40 |
+
Returns
|
41 |
+
-------
|
42 |
+
new_xyz : torch.Tensor
|
43 |
+
(B, npoint, 3) tensor of the new features' xyz
|
44 |
+
new_features : torch.Tensor
|
45 |
+
(B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors
|
46 |
+
"""
|
47 |
+
|
48 |
+
new_features_list = []
|
49 |
+
|
50 |
+
xyz_flipped = xyz.transpose(1, 2).contiguous()
|
51 |
+
new_xyz = (
|
52 |
+
pointnet2_utils.gather_operation(
|
53 |
+
xyz_flipped, pointnet2_utils.furthest_point_sample(xyz, self.npoint)
|
54 |
+
)
|
55 |
+
.transpose(1, 2)
|
56 |
+
.contiguous()
|
57 |
+
if self.npoint is not None
|
58 |
+
else None
|
59 |
+
)
|
60 |
+
|
61 |
+
for i in range(len(self.groupers)):
|
62 |
+
new_features = self.groupers[i](
|
63 |
+
xyz, new_xyz, features
|
64 |
+
) # (B, C, npoint, nsample)
|
65 |
+
|
66 |
+
new_features = self.mlps[i](new_features) # (B, mlp[-1], npoint, nsample)
|
67 |
+
new_features = F.max_pool2d(
|
68 |
+
new_features, kernel_size=[1, new_features.size(3)]
|
69 |
+
) # (B, mlp[-1], npoint, 1)
|
70 |
+
new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint)
|
71 |
+
|
72 |
+
new_features_list.append(new_features)
|
73 |
+
|
74 |
+
return new_xyz, torch.cat(new_features_list, dim=1)
|
75 |
+
|
76 |
+
|
77 |
+
class PointnetSAModuleMSG(_PointnetSAModuleBase):
|
78 |
+
r"""Pointnet set abstrction layer with multiscale grouping
|
79 |
+
|
80 |
+
Parameters
|
81 |
+
----------
|
82 |
+
npoint : int
|
83 |
+
Number of features
|
84 |
+
radii : list of float32
|
85 |
+
list of radii to group with
|
86 |
+
nsamples : list of int32
|
87 |
+
Number of samples in each ball query
|
88 |
+
mlps : list of list of int32
|
89 |
+
Spec of the pointnet before the global max_pool for each scale
|
90 |
+
bn : bool
|
91 |
+
Use batchnorm
|
92 |
+
"""
|
93 |
+
|
94 |
+
def __init__(self, npoint, radii, nsamples, mlps, bn=True, use_xyz=True):
|
95 |
+
# type: (PointnetSAModuleMSG, int, List[float], List[int], List[List[int]], bool, bool) -> None
|
96 |
+
super(PointnetSAModuleMSG, self).__init__()
|
97 |
+
|
98 |
+
assert len(radii) == len(nsamples) == len(mlps)
|
99 |
+
|
100 |
+
self.npoint = npoint
|
101 |
+
self.groupers = nn.ModuleList()
|
102 |
+
self.mlps = nn.ModuleList()
|
103 |
+
for i in range(len(radii)):
|
104 |
+
radius = radii[i]
|
105 |
+
nsample = nsamples[i]
|
106 |
+
self.groupers.append(
|
107 |
+
pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz)
|
108 |
+
if npoint is not None
|
109 |
+
else pointnet2_utils.GroupAll(use_xyz)
|
110 |
+
)
|
111 |
+
mlp_spec = mlps[i]
|
112 |
+
if use_xyz:
|
113 |
+
mlp_spec[0] += 3
|
114 |
+
|
115 |
+
self.mlps.append(build_shared_mlp(mlp_spec, bn))
|
116 |
+
|
117 |
+
|
118 |
+
class PointnetSAModule(PointnetSAModuleMSG):
|
119 |
+
r"""Pointnet set abstrction layer
|
120 |
+
|
121 |
+
Parameters
|
122 |
+
----------
|
123 |
+
npoint : int
|
124 |
+
Number of features
|
125 |
+
radius : float
|
126 |
+
Radius of ball
|
127 |
+
nsample : int
|
128 |
+
Number of samples in the ball query
|
129 |
+
mlp : list
|
130 |
+
Spec of the pointnet before the global max_pool
|
131 |
+
bn : bool
|
132 |
+
Use batchnorm
|
133 |
+
"""
|
134 |
+
|
135 |
+
def __init__(
|
136 |
+
self, mlp, npoint=None, radius=None, nsample=None, bn=True, use_xyz=True
|
137 |
+
):
|
138 |
+
# type: (PointnetSAModule, List[int], int, float, int, bool, bool) -> None
|
139 |
+
super(PointnetSAModule, self).__init__(
|
140 |
+
mlps=[mlp],
|
141 |
+
npoint=npoint,
|
142 |
+
radii=[radius],
|
143 |
+
nsamples=[nsample],
|
144 |
+
bn=bn,
|
145 |
+
use_xyz=use_xyz,
|
146 |
+
)
|
147 |
+
|
148 |
+
|
149 |
+
class PointnetFPModule(nn.Module):
|
150 |
+
r"""Propigates the features of one set to another
|
151 |
+
|
152 |
+
Parameters
|
153 |
+
----------
|
154 |
+
mlp : list
|
155 |
+
Pointnet module parameters
|
156 |
+
bn : bool
|
157 |
+
Use batchnorm
|
158 |
+
"""
|
159 |
+
|
160 |
+
def __init__(self, mlp, bn=True):
|
161 |
+
# type: (PointnetFPModule, List[int], bool) -> None
|
162 |
+
super(PointnetFPModule, self).__init__()
|
163 |
+
self.mlp = build_shared_mlp(mlp, bn=bn)
|
164 |
+
|
165 |
+
def forward(self, unknown, known, unknow_feats, known_feats):
|
166 |
+
# type: (PointnetFPModule, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
|
167 |
+
r"""
|
168 |
+
Parameters
|
169 |
+
----------
|
170 |
+
unknown : torch.Tensor
|
171 |
+
(B, n, 3) tensor of the xyz positions of the unknown features
|
172 |
+
known : torch.Tensor
|
173 |
+
(B, m, 3) tensor of the xyz positions of the known features
|
174 |
+
unknow_feats : torch.Tensor
|
175 |
+
(B, C1, n) tensor of the features to be propigated to
|
176 |
+
known_feats : torch.Tensor
|
177 |
+
(B, C2, m) tensor of features to be propigated
|
178 |
+
|
179 |
+
Returns
|
180 |
+
-------
|
181 |
+
new_features : torch.Tensor
|
182 |
+
(B, mlp[-1], n) tensor of the features of the unknown features
|
183 |
+
"""
|
184 |
+
|
185 |
+
if known is not None:
|
186 |
+
dist, idx = pointnet2_utils.three_nn(unknown, known)
|
187 |
+
dist_recip = 1.0 / (dist + 1e-8)
|
188 |
+
norm = torch.sum(dist_recip, dim=2, keepdim=True)
|
189 |
+
weight = dist_recip / norm
|
190 |
+
|
191 |
+
interpolated_feats = pointnet2_utils.three_interpolate(
|
192 |
+
known_feats, idx, weight
|
193 |
+
)
|
194 |
+
else:
|
195 |
+
interpolated_feats = known_feats.expand(
|
196 |
+
*(known_feats.size()[0:2] + [unknown.size(1)])
|
197 |
+
)
|
198 |
+
|
199 |
+
if unknow_feats is not None:
|
200 |
+
new_features = torch.cat(
|
201 |
+
[interpolated_feats, unknow_feats], dim=1
|
202 |
+
) # (B, C2 + C1, n)
|
203 |
+
else:
|
204 |
+
new_features = interpolated_feats
|
205 |
+
|
206 |
+
new_features = new_features.unsqueeze(-1)
|
207 |
+
new_features = self.mlp(new_features)
|
208 |
+
|
209 |
+
return new_features.squeeze(-1)
|
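A usage sketch for the set-abstraction / feature-propagation pair defined above (shapes follow the docstrings; it assumes a built extension and CUDA tensors):

import torch
from pointnet2_ops.pointnet2_modules import PointnetSAModule, PointnetFPModule

xyz = torch.rand(2, 1024, 3).cuda()                      # (B, N, 3) coordinates
feats = torch.rand(2, 16, 1024).cuda()                   # (B, C, N) per-point descriptors

sa = PointnetSAModule(npoint=256, radius=0.2, nsample=32,
                      mlp=[16, 32, 64], use_xyz=True).cuda()
new_xyz, new_feats = sa(xyz, feats)                      # (2, 256, 3), (2, 64, 256)

fp = PointnetFPModule(mlp=[64 + 16, 64]).cuda()
up_feats = fp(xyz, new_xyz, feats, new_feats)            # (2, 64, 1024) features propagated back to all points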
hort/models/tgs/models/snowflake/pointnet2_ops_lib/pointnet2_ops/pointnet2_utils.py
ADDED
@@ -0,0 +1,391 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import warnings
|
4 |
+
from torch.autograd import Function
|
5 |
+
from typing import *
|
6 |
+
|
7 |
+
try:
|
8 |
+
import pointnet2_ops._ext as _ext
|
9 |
+
except ImportError:
|
10 |
+
from torch.utils.cpp_extension import load
|
11 |
+
import glob
|
12 |
+
import os.path as osp
|
13 |
+
import os
|
14 |
+
|
15 |
+
warnings.warn("Unable to load pointnet2_ops cpp extension. JIT Compiling.")
|
16 |
+
|
17 |
+
_ext_src_root = osp.join(osp.dirname(__file__), "_ext-src")
|
18 |
+
_ext_sources = glob.glob(osp.join(_ext_src_root, "src", "*.cpp")) + glob.glob(
|
19 |
+
osp.join(_ext_src_root, "src", "*.cu")
|
20 |
+
)
|
21 |
+
_ext_headers = glob.glob(osp.join(_ext_src_root, "include", "*"))
|
22 |
+
|
23 |
+
os.environ["TORCH_CUDA_ARCH_LIST"] = "3.7+PTX;5.0;6.0;6.1;6.2;7.0;7.5"
|
24 |
+
_ext = load(
|
25 |
+
"_ext",
|
26 |
+
sources=_ext_sources,
|
27 |
+
extra_include_paths=[osp.join(_ext_src_root, "include")],
|
28 |
+
extra_cflags=["-O3"],
|
29 |
+
extra_cuda_cflags=["-O3", "-Xfatbin", "-compress-all"],
|
30 |
+
with_cuda=True,
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
class FurthestPointSampling(Function):
|
35 |
+
@staticmethod
|
36 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
37 |
+
def forward(ctx, xyz, npoint):
|
38 |
+
# type: (Any, torch.Tensor, int) -> torch.Tensor
|
39 |
+
r"""
|
40 |
+
Uses iterative furthest point sampling to select a set of npoint features that have the largest
|
41 |
+
minimum distance
|
42 |
+
|
43 |
+
Parameters
|
44 |
+
----------
|
45 |
+
xyz : torch.Tensor
|
46 |
+
(B, N, 3) tensor where N > npoint
|
47 |
+
npoint : int32
|
48 |
+
number of features in the sampled set
|
49 |
+
|
50 |
+
Returns
|
51 |
+
-------
|
52 |
+
torch.Tensor
|
53 |
+
(B, npoint) tensor containing the set
|
54 |
+
"""
|
55 |
+
out = _ext.furthest_point_sampling(xyz, npoint)
|
56 |
+
|
57 |
+
ctx.mark_non_differentiable(out)
|
58 |
+
|
59 |
+
return out
|
60 |
+
|
61 |
+
@staticmethod
|
62 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
63 |
+
def backward(ctx, grad_out):
|
64 |
+
return ()
|
65 |
+
|
66 |
+
|
67 |
+
furthest_point_sample = FurthestPointSampling.apply
|
68 |
+
|
69 |
+
|
70 |
+
class GatherOperation(Function):
|
71 |
+
@staticmethod
|
72 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
73 |
+
def forward(ctx, features, idx):
|
74 |
+
# type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
|
75 |
+
r"""
|
76 |
+
|
77 |
+
Parameters
|
78 |
+
----------
|
79 |
+
features : torch.Tensor
|
80 |
+
(B, C, N) tensor
|
81 |
+
|
82 |
+
idx : torch.Tensor
|
83 |
+
(B, npoint) tensor of the features to gather
|
84 |
+
|
85 |
+
Returns
|
86 |
+
-------
|
87 |
+
torch.Tensor
|
88 |
+
(B, C, npoint) tensor
|
89 |
+
"""
|
90 |
+
|
91 |
+
ctx.save_for_backward(idx, features)
|
92 |
+
|
93 |
+
return _ext.gather_points(features, idx)
|
94 |
+
|
95 |
+
@staticmethod
|
96 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
97 |
+
def backward(ctx, grad_out):
|
98 |
+
idx, features = ctx.saved_tensors
|
99 |
+
N = features.size(2)
|
100 |
+
|
101 |
+
grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N)
|
102 |
+
return grad_features, None
|
103 |
+
|
104 |
+
|
105 |
+
gather_operation = GatherOperation.apply
|
106 |
+
|
107 |
+
|
108 |
+
class ThreeNN(Function):
|
109 |
+
@staticmethod
|
110 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
111 |
+
def forward(ctx, unknown, known):
|
112 |
+
# type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
|
113 |
+
r"""
|
114 |
+
Find the three nearest neighbors of unknown in known
|
115 |
+
Parameters
|
116 |
+
----------
|
117 |
+
unknown : torch.Tensor
|
118 |
+
(B, n, 3) tensor of known features
|
119 |
+
known : torch.Tensor
|
120 |
+
(B, m, 3) tensor of unknown features
|
121 |
+
|
122 |
+
Returns
|
123 |
+
-------
|
124 |
+
dist : torch.Tensor
|
125 |
+
(B, n, 3) l2 distance to the three nearest neighbors
|
126 |
+
idx : torch.Tensor
|
127 |
+
(B, n, 3) index of 3 nearest neighbors
|
128 |
+
"""
|
129 |
+
dist2, idx = _ext.three_nn(unknown, known)
|
130 |
+
dist = torch.sqrt(dist2)
|
131 |
+
|
132 |
+
ctx.mark_non_differentiable(dist, idx)
|
133 |
+
|
134 |
+
return dist, idx
|
135 |
+
|
136 |
+
@staticmethod
|
137 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
138 |
+
def backward(ctx, grad_dist, grad_idx):
|
139 |
+
return ()
|
140 |
+
|
141 |
+
|
142 |
+
three_nn = ThreeNN.apply
|
143 |
+
|
144 |
+
|
145 |
+
class ThreeInterpolate(Function):
|
146 |
+
@staticmethod
|
147 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
148 |
+
def forward(ctx, features, idx, weight):
|
149 |
+
# type(Any, torch.Tensor, torch.Tensor, torch.Tensor) -> Torch.Tensor
|
150 |
+
r"""
|
151 |
+
Performs weight linear interpolation on 3 features
|
152 |
+
Parameters
|
153 |
+
----------
|
154 |
+
features : torch.Tensor
|
155 |
+
(B, c, m) Features descriptors to be interpolated from
|
156 |
+
idx : torch.Tensor
|
157 |
+
(B, n, 3) three nearest neighbors of the target features in features
|
158 |
+
weight : torch.Tensor
|
159 |
+
(B, n, 3) weights
|
160 |
+
|
161 |
+
Returns
|
162 |
+
-------
|
163 |
+
torch.Tensor
|
164 |
+
(B, c, n) tensor of the interpolated features
|
165 |
+
"""
|
166 |
+
ctx.save_for_backward(idx, weight, features)
|
167 |
+
|
168 |
+
return _ext.three_interpolate(features, idx, weight)
|
169 |
+
|
170 |
+
@staticmethod
|
171 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
172 |
+
def backward(ctx, grad_out):
|
173 |
+
# type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
|
174 |
+
r"""
|
175 |
+
Parameters
|
176 |
+
----------
|
177 |
+
grad_out : torch.Tensor
|
178 |
+
(B, c, n) tensor with gradients of ouputs
|
179 |
+
|
180 |
+
Returns
|
181 |
+
-------
|
182 |
+
grad_features : torch.Tensor
|
183 |
+
(B, c, m) tensor with gradients of features
|
184 |
+
|
185 |
+
None
|
186 |
+
|
187 |
+
None
|
188 |
+
"""
|
189 |
+
idx, weight, features = ctx.saved_tensors
|
190 |
+
m = features.size(2)
|
191 |
+
|
192 |
+
grad_features = _ext.three_interpolate_grad(
|
193 |
+
grad_out.contiguous(), idx, weight, m
|
194 |
+
)
|
195 |
+
|
196 |
+
return grad_features, torch.zeros_like(idx), torch.zeros_like(weight)
|
197 |
+
|
198 |
+
|
199 |
+
three_interpolate = ThreeInterpolate.apply
|
200 |
+
|
201 |
+
|
202 |
+
class GroupingOperation(Function):
|
203 |
+
@staticmethod
|
204 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
205 |
+
def forward(ctx, features, idx):
|
206 |
+
# type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
|
207 |
+
r"""
|
208 |
+
|
209 |
+
Parameters
|
210 |
+
----------
|
211 |
+
features : torch.Tensor
|
212 |
+
(B, C, N) tensor of features to group
|
213 |
+
idx : torch.Tensor
|
214 |
+
(B, npoint, nsample) tensor containing the indicies of features to group with
|
215 |
+
|
216 |
+
Returns
|
217 |
+
-------
|
218 |
+
torch.Tensor
|
219 |
+
(B, C, npoint, nsample) tensor
|
220 |
+
"""
|
221 |
+
ctx.save_for_backward(idx, features)
|
222 |
+
|
223 |
+
return _ext.group_points(features, idx)
|
224 |
+
|
225 |
+
@staticmethod
|
226 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
227 |
+
def backward(ctx, grad_out):
|
228 |
+
# type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]
|
229 |
+
r"""
|
230 |
+
|
231 |
+
Parameters
|
232 |
+
----------
|
233 |
+
grad_out : torch.Tensor
|
234 |
+
(B, C, npoint, nsample) tensor of the gradients of the output from forward
|
235 |
+
|
236 |
+
Returns
|
237 |
+
-------
|
238 |
+
torch.Tensor
|
239 |
+
(B, C, N) gradient of the features
|
240 |
+
None
|
241 |
+
"""
|
242 |
+
idx, features = ctx.saved_tensors
|
243 |
+
N = features.size(2)
|
244 |
+
|
245 |
+
grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N)
|
246 |
+
|
247 |
+
return grad_features, torch.zeros_like(idx)
|
248 |
+
|
249 |
+
|
250 |
+
grouping_operation = GroupingOperation.apply
|
251 |
+
|
252 |
+
|
253 |
+
class BallQuery(Function):
|
254 |
+
@staticmethod
|
255 |
+
@torch.amp.custom_fwd(cast_inputs=torch.float32, device_type="cuda")
|
256 |
+
def forward(ctx, radius, nsample, xyz, new_xyz):
|
257 |
+
# type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor
|
258 |
+
r"""
|
259 |
+
|
260 |
+
Parameters
|
261 |
+
----------
|
262 |
+
radius : float
|
263 |
+
radius of the balls
|
264 |
+
nsample : int
|
265 |
+
maximum number of features in the balls
|
266 |
+
xyz : torch.Tensor
|
267 |
+
(B, N, 3) xyz coordinates of the features
|
268 |
+
new_xyz : torch.Tensor
|
269 |
+
(B, npoint, 3) centers of the ball query
|
270 |
+
|
271 |
+
Returns
|
272 |
+
-------
|
273 |
+
torch.Tensor
|
274 |
+
(B, npoint, nsample) tensor with the indices of the features that form the query balls
|
275 |
+
"""
|
276 |
+
output = _ext.ball_query(new_xyz, xyz, radius, nsample)
|
277 |
+
|
278 |
+
ctx.mark_non_differentiable(output)
|
279 |
+
|
280 |
+
return output
|
281 |
+
|
282 |
+
@staticmethod
|
283 |
+
@torch.amp.custom_bwd(device_type="cuda")
|
284 |
+
def backward(ctx, grad_out):
|
285 |
+
return ()
|
286 |
+
|
287 |
+
|
288 |
+
ball_query = BallQuery.apply
|
289 |
+
|
290 |
+
|
291 |
+
class QueryAndGroup(nn.Module):
|
292 |
+
r"""
|
293 |
+
Groups with a ball query of radius
|
294 |
+
|
295 |
+
Parameters
|
296 |
+
----------
|
297 |
+
radius : float32
|
298 |
+
Radius of ball
|
299 |
+
nsample : int32
|
300 |
+
Maximum number of features to gather in the ball
|
301 |
+
"""
|
302 |
+
|
303 |
+
def __init__(self, radius, nsample, use_xyz=True):
|
304 |
+
# type: (QueryAndGroup, float, int, bool) -> None
|
305 |
+
super(QueryAndGroup, self).__init__()
|
306 |
+
self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
|
307 |
+
|
308 |
+
def forward(self, xyz, new_xyz, features=None):
|
309 |
+
# type: (QueryAndGroup, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
|
310 |
+
r"""
|
311 |
+
Parameters
|
312 |
+
----------
|
313 |
+
xyz : torch.Tensor
|
314 |
+
xyz coordinates of the features (B, N, 3)
|
315 |
+
new_xyz : torch.Tensor
|
316 |
+
centroids (B, npoint, 3)
|
317 |
+
features : torch.Tensor
|
318 |
+
Descriptors of the features (B, C, N)
|
319 |
+
|
320 |
+
Returns
|
321 |
+
-------
|
322 |
+
new_features : torch.Tensor
|
323 |
+
(B, 3 + C, npoint, nsample) tensor
|
324 |
+
"""
|
325 |
+
|
326 |
+
idx = ball_query(self.radius, self.nsample, xyz, new_xyz)
|
327 |
+
xyz_trans = xyz.transpose(1, 2).contiguous()
|
328 |
+
grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample)
|
329 |
+
grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)
|
330 |
+
|
331 |
+
if features is not None:
|
332 |
+
grouped_features = grouping_operation(features, idx)
|
333 |
+
if self.use_xyz:
|
334 |
+
new_features = torch.cat(
|
335 |
+
[grouped_xyz, grouped_features], dim=1
|
336 |
+
) # (B, C + 3, npoint, nsample)
|
337 |
+
else:
|
338 |
+
new_features = grouped_features
|
339 |
+
else:
|
340 |
+
assert (
|
341 |
+
self.use_xyz
|
342 |
+
), "Cannot have not features and not use xyz as a feature!"
|
343 |
+
new_features = grouped_xyz
|
344 |
+
|
345 |
+
return new_features
|
346 |
+
|
347 |
+
|
348 |
+
class GroupAll(nn.Module):
|
349 |
+
r"""
|
350 |
+
Groups all features
|
351 |
+
|
352 |
+
Parameters
|
353 |
+
----------
|
354 |
+
"""
|
355 |
+
|
356 |
+
def __init__(self, use_xyz=True):
|
357 |
+
# type: (GroupAll, bool) -> None
|
358 |
+
super(GroupAll, self).__init__()
|
359 |
+
self.use_xyz = use_xyz
|
360 |
+
|
361 |
+
def forward(self, xyz, new_xyz, features=None):
|
362 |
+
# type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
|
363 |
+
r"""
|
364 |
+
Parameters
|
365 |
+
----------
|
366 |
+
xyz : torch.Tensor
|
367 |
+
xyz coordinates of the features (B, N, 3)
|
368 |
+
new_xyz : torch.Tensor
|
369 |
+
Ignored
|
370 |
+
features : torch.Tensor
|
371 |
+
Descriptors of the features (B, C, N)
|
372 |
+
|
373 |
+
Returns
|
374 |
+
-------
|
375 |
+
new_features : torch.Tensor
|
376 |
+
(B, C + 3, 1, N) tensor
|
377 |
+
"""
|
378 |
+
|
379 |
+
grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
|
380 |
+
if features is not None:
|
381 |
+
grouped_features = features.unsqueeze(2)
|
382 |
+
if self.use_xyz:
|
383 |
+
new_features = torch.cat(
|
384 |
+
[grouped_xyz, grouped_features], dim=1
|
385 |
+
) # (B, 3 + C, 1, N)
|
386 |
+
else:
|
387 |
+
new_features = grouped_features
|
388 |
+
else:
|
389 |
+
new_features = grouped_xyz
|
390 |
+
|
391 |
+
return new_features
|
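Note: a minimal usage sketch of QueryAndGroup (hypothetical shapes; assumes the pointnet2_ops extension is built and a CUDA device is available), for illustration only:

    import torch
    from pointnet2_ops.pointnet2_utils import QueryAndGroup

    xyz = torch.rand(2, 1024, 3, device="cuda")        # (B, N, 3) point coordinates
    new_xyz = xyz[:, :128, :].contiguous()             # (B, npoint, 3) ball-query centers
    features = torch.rand(2, 32, 1024, device="cuda")  # (B, C, N) per-point descriptors

    grouper = QueryAndGroup(radius=0.2, nsample=16, use_xyz=True)
    grouped = grouper(xyz, new_xyz, features)          # (B, C + 3, npoint, nsample) = (2, 35, 128, 16)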
hort/models/tgs/models/snowflake/pointnet2_ops_lib/setup.py
ADDED
@@ -0,0 +1,41 @@
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
import os.path as osp
|
4 |
+
|
5 |
+
from setuptools import find_packages, setup
|
6 |
+
import torch
|
7 |
+
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
8 |
+
|
9 |
+
this_dir = osp.dirname(osp.abspath(__file__))
|
10 |
+
_ext_src_root = osp.join("pointnet2_ops", "_ext-src")
|
11 |
+
_ext_sources = glob.glob(osp.join(_ext_src_root, "src", "*.cpp")) + glob.glob(
|
12 |
+
osp.join(_ext_src_root, "src", "*.cu")
|
13 |
+
)
|
14 |
+
_ext_headers = glob.glob(osp.join(_ext_src_root, "include", "*"))
|
15 |
+
|
16 |
+
requirements = ["torch>=1.4"]
|
17 |
+
|
18 |
+
exec(open(osp.join("pointnet2_ops", "_version.py")).read())
|
19 |
+
|
20 |
+
# os.environ["TORCH_CUDA_ARCH_LIST"] = ".".join(map(str, torch.cuda.get_device_capability()))
|
21 |
+
os.environ["TORCH_CUDA_ARCH_LIST"] = "5.0;6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0"
|
22 |
+
setup(
|
23 |
+
name="pointnet2_ops",
|
24 |
+
version=__version__,
|
25 |
+
author="Erik Wijmans",
|
26 |
+
packages=find_packages(),
|
27 |
+
install_requires=requirements,
|
28 |
+
ext_modules=[
|
29 |
+
CUDAExtension(
|
30 |
+
name="pointnet2_ops._ext",
|
31 |
+
sources=_ext_sources,
|
32 |
+
extra_compile_args={
|
33 |
+
"cxx": ["-O3"],
|
34 |
+
"nvcc": ["-O3", "-Xfatbin", "-compress-all"],
|
35 |
+
},
|
36 |
+
include_dirs=[osp.join(this_dir, _ext_src_root, "include")],
|
37 |
+
)
|
38 |
+
],
|
39 |
+
cmdclass={"build_ext": BuildExtension},
|
40 |
+
include_package_data=True,
|
41 |
+
)
|
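Note: as with any torch.utils.cpp_extension package, this extension is typically built by running `pip install .` (or `python setup.py install`) from the pointnet2_ops_lib directory with a CUDA toolchain available; the hard-coded TORCH_CUDA_ARCH_LIST above controls which GPU architectures get compiled.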
hort/models/tgs/models/snowflake/skip_transformer.py
ADDED
@@ -0,0 +1,69 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# @Author: Peng Xiang
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch import nn, einsum
|
6 |
+
from .utils import MLP_Res, grouping_operation, query_knn
|
7 |
+
|
8 |
+
|
9 |
+
class SkipTransformer(nn.Module):
|
10 |
+
def __init__(self, in_channel, dim=256, n_knn=16, pos_hidden_dim=64, attn_hidden_multiplier=4):
|
11 |
+
super(SkipTransformer, self).__init__()
|
12 |
+
self.mlp_v = MLP_Res(in_dim=in_channel*2, hidden_dim=in_channel, out_dim=in_channel)
|
13 |
+
self.n_knn = n_knn
|
14 |
+
self.conv_key = nn.Conv1d(in_channel, dim, 1)
|
15 |
+
self.conv_query = nn.Conv1d(in_channel, dim, 1)
|
16 |
+
self.conv_value = nn.Conv1d(in_channel, dim, 1)
|
17 |
+
|
18 |
+
self.pos_mlp = nn.Sequential(
|
19 |
+
nn.Conv2d(3, pos_hidden_dim, 1),
|
20 |
+
nn.BatchNorm2d(pos_hidden_dim),
|
21 |
+
nn.ReLU(),
|
22 |
+
nn.Conv2d(pos_hidden_dim, dim, 1)
|
23 |
+
)
|
24 |
+
|
25 |
+
self.attn_mlp = nn.Sequential(
|
26 |
+
nn.Conv2d(dim, dim * attn_hidden_multiplier, 1),
|
27 |
+
nn.BatchNorm2d(dim * attn_hidden_multiplier),
|
28 |
+
nn.ReLU(),
|
29 |
+
nn.Conv2d(dim * attn_hidden_multiplier, dim, 1)
|
30 |
+
)
|
31 |
+
|
32 |
+
self.conv_end = nn.Conv1d(dim, in_channel, 1)
|
33 |
+
|
34 |
+
def forward(self, pos, key, query, include_self=True):
|
35 |
+
"""
|
36 |
+
Args:
|
37 |
+
pos: (B, 3, N)
|
38 |
+
key: (B, in_channel, N)
|
39 |
+
query: (B, in_channel, N)
|
40 |
+
include_self: boolean
|
41 |
+
|
42 |
+
Returns:
|
43 |
+
Tensor: (B, in_channel, N), shape context feature
|
44 |
+
"""
|
45 |
+
value = self.mlp_v(torch.cat([key, query], 1))
|
46 |
+
identity = value
|
47 |
+
key = self.conv_key(key)
|
48 |
+
query = self.conv_query(query)
|
49 |
+
value = self.conv_value(value)
|
50 |
+
b, dim, n = value.shape
|
51 |
+
|
52 |
+
pos_flipped = pos.permute(0, 2, 1).contiguous()
|
53 |
+
idx_knn = query_knn(self.n_knn, pos_flipped, pos_flipped, include_self=include_self)
|
54 |
+
|
55 |
+
key = grouping_operation(key, idx_knn) # b, dim, n, n_knn
|
56 |
+
qk_rel = query.reshape((b, -1, n, 1)) - key
|
57 |
+
|
58 |
+
pos_rel = pos.reshape((b, -1, n, 1)) - grouping_operation(pos, idx_knn) # b, 3, n, n_knn
|
59 |
+
pos_embedding = self.pos_mlp(pos_rel)
|
60 |
+
|
61 |
+
attention = self.attn_mlp(qk_rel + pos_embedding) # b, dim, n, n_knn
|
62 |
+
attention = torch.softmax(attention, -1)
|
63 |
+
|
64 |
+
value = value.reshape((b, -1, n, 1)) + pos_embedding #
|
65 |
+
|
66 |
+
agg = einsum('b c i j, b c i j -> b c i', attention, value) # b, dim, n
|
67 |
+
y = self.conv_end(agg)
|
68 |
+
|
69 |
+
return y + identity
|
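Note: a minimal usage sketch of SkipTransformer (hypothetical sizes; needs the pointnet2_ops extension and a CUDA device because it relies on grouping_operation), illustrative only:

    import torch
    from tgs.models.snowflake.skip_transformer import SkipTransformer  # assumed import path

    st = SkipTransformer(in_channel=128, dim=64, n_knn=16).cuda()
    pos = torch.rand(2, 3, 512, device="cuda")      # (B, 3, N) point positions
    key = torch.rand(2, 128, 512, device="cuda")    # (B, in_channel, N)
    query = torch.rand(2, 128, 512, device="cuda")  # (B, in_channel, N)
    out = st(pos, key, query)                       # (B, in_channel, N) shape-context feature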
hort/models/tgs/models/snowflake/utils.py
ADDED
@@ -0,0 +1,741 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# @Author: Peng Xiang
|
3 |
+
|
4 |
+
import types
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import numpy as np
|
8 |
+
from torch import nn, einsum
|
9 |
+
from pointnet2_ops.pointnet2_utils import furthest_point_sample, \
|
10 |
+
gather_operation, ball_query, three_nn, three_interpolate, grouping_operation
|
11 |
+
|
12 |
+
class Conv1d(nn.Module):
|
13 |
+
def __init__(self, in_channel, out_channel, kernel_size=1, stride=1, if_bn=True, activation_fn=torch.relu):
|
14 |
+
super(Conv1d, self).__init__()
|
15 |
+
self.conv = nn.Conv1d(in_channel, out_channel, kernel_size, stride=stride)
|
16 |
+
self.if_bn = if_bn
|
17 |
+
self.bn = nn.BatchNorm1d(out_channel)
|
18 |
+
self.activation_fn = activation_fn
|
19 |
+
|
20 |
+
def forward(self, input):
|
21 |
+
out = self.conv(input)
|
22 |
+
if self.if_bn:
|
23 |
+
out = self.bn(out)
|
24 |
+
|
25 |
+
if self.activation_fn is not None:
|
26 |
+
out = self.activation_fn(out)
|
27 |
+
|
28 |
+
return out
|
29 |
+
|
30 |
+
class Conv2d(nn.Module):
|
31 |
+
def __init__(self, in_channel, out_channel, kernel_size=(1, 1), stride=(1, 1), if_bn=True, activation_fn=torch.relu):
|
32 |
+
super(Conv2d, self).__init__()
|
33 |
+
self.conv = nn.Conv2d(in_channel, out_channel, kernel_size, stride=stride)
|
34 |
+
self.if_bn = if_bn
|
35 |
+
self.bn = nn.BatchNorm2d(out_channel)
|
36 |
+
self.activation_fn = activation_fn
|
37 |
+
|
38 |
+
def forward(self, input):
|
39 |
+
out = self.conv(input)
|
40 |
+
if self.if_bn:
|
41 |
+
out = self.bn(out)
|
42 |
+
|
43 |
+
if self.activation_fn is not None:
|
44 |
+
out = self.activation_fn(out)
|
45 |
+
|
46 |
+
return out
|
47 |
+
|
48 |
+
class MLP(nn.Module):
|
49 |
+
def __init__(self, in_channel, layer_dims, bn=None):
|
50 |
+
super(MLP, self).__init__()
|
51 |
+
layers = []
|
52 |
+
last_channel = in_channel
|
53 |
+
for out_channel in layer_dims[:-1]:
|
54 |
+
layers.append(nn.Linear(last_channel, out_channel))
|
55 |
+
if bn:
|
56 |
+
layers.append(nn.BatchNorm1d(out_channel))
|
57 |
+
layers.append(nn.ReLU())
|
58 |
+
last_channel = out_channel
|
59 |
+
layers.append(nn.Linear(last_channel, layer_dims[-1]))
|
60 |
+
self.mlp = nn.Sequential(*layers)
|
61 |
+
|
62 |
+
def forward(self, inputs):
|
63 |
+
return self.mlp(inputs)
|
64 |
+
|
65 |
+
class MLP_CONV(nn.Module):
|
66 |
+
def __init__(self, in_channel, layer_dims, bn=None):
|
67 |
+
super(MLP_CONV, self).__init__()
|
68 |
+
layers = []
|
69 |
+
last_channel = in_channel
|
70 |
+
for out_channel in layer_dims[:-1]:
|
71 |
+
layers.append(nn.Conv1d(last_channel, out_channel, 1))
|
72 |
+
if bn:
|
73 |
+
layers.append(nn.BatchNorm1d(out_channel))
|
74 |
+
layers.append(nn.ReLU())
|
75 |
+
last_channel = out_channel
|
76 |
+
layers.append(nn.Conv1d(last_channel, layer_dims[-1], 1))
|
77 |
+
self.mlp = nn.Sequential(*layers)
|
78 |
+
|
79 |
+
def forward(self, inputs):
|
80 |
+
return self.mlp(inputs)
|
81 |
+
|
82 |
+
class MLP_Res(nn.Module):
|
83 |
+
def __init__(self, in_dim=128, hidden_dim=None, out_dim=128):
|
84 |
+
super(MLP_Res, self).__init__()
|
85 |
+
if hidden_dim is None:
|
86 |
+
hidden_dim = in_dim
|
87 |
+
self.conv_1 = nn.Conv1d(in_dim, hidden_dim, 1)
|
88 |
+
self.conv_2 = nn.Conv1d(hidden_dim, out_dim, 1)
|
89 |
+
self.conv_shortcut = nn.Conv1d(in_dim, out_dim, 1)
|
90 |
+
|
91 |
+
def forward(self, x):
|
92 |
+
"""
|
93 |
+
Args:
|
94 |
+
x: (B, out_dim, n)
|
95 |
+
"""
|
96 |
+
shortcut = self.conv_shortcut(x)
|
97 |
+
out = self.conv_2(torch.relu(self.conv_1(x))) + shortcut
|
98 |
+
return out
|
99 |
+
|
100 |
+
|
101 |
+
def sample_and_group(xyz, points, npoint, nsample, radius, use_xyz=True):
|
102 |
+
"""
|
103 |
+
Args:
|
104 |
+
xyz: Tensor, (B, 3, N)
|
105 |
+
points: Tensor, (B, f, N)
|
106 |
+
npoint: int
|
107 |
+
nsample: int
|
108 |
+
radius: float
|
109 |
+
use_xyz: boolean
|
110 |
+
|
111 |
+
Returns:
|
112 |
+
new_xyz: Tensor, (B, 3, npoint)
|
113 |
+
new_points: Tensor, (B, 3 | f+3 | f, npoint, nsample)
|
114 |
+
idx_local: Tensor, (B, npoint, nsample)
|
115 |
+
grouped_xyz: Tensor, (B, 3, npoint, nsample)
|
116 |
+
|
117 |
+
"""
|
118 |
+
xyz_flipped = xyz.permute(0, 2, 1).contiguous() # (B, N, 3)
|
119 |
+
new_xyz = gather_operation(xyz, furthest_point_sample(xyz_flipped, npoint)) # (B, 3, npoint)
|
120 |
+
|
121 |
+
idx = ball_query(radius, nsample, xyz_flipped, new_xyz.permute(0, 2, 1).contiguous()) # (B, npoint, nsample)
|
122 |
+
grouped_xyz = grouping_operation(xyz, idx) # (B, 3, npoint, nsample)
|
123 |
+
grouped_xyz -= new_xyz.unsqueeze(3).repeat(1, 1, 1, nsample)
|
124 |
+
|
125 |
+
if points is not None:
|
126 |
+
grouped_points = grouping_operation(points, idx) # (B, f, npoint, nsample)
|
127 |
+
if use_xyz:
|
128 |
+
new_points = torch.cat([grouped_xyz, grouped_points], 1)
|
129 |
+
else:
|
130 |
+
new_points = grouped_points
|
131 |
+
else:
|
132 |
+
new_points = grouped_xyz
|
133 |
+
|
134 |
+
return new_xyz, new_points, idx, grouped_xyz
|
135 |
+
|
136 |
+
|
137 |
+
def sample_and_group_all(xyz, points, use_xyz=True):
|
138 |
+
"""
|
139 |
+
Args:
|
140 |
+
xyz: Tensor, (B, 3, nsample)
|
141 |
+
points: Tensor, (B, f, nsample)
|
142 |
+
use_xyz: boolean
|
143 |
+
|
144 |
+
Returns:
|
145 |
+
new_xyz: Tensor, (B, 3, 1)
|
146 |
+
new_points: Tensor, (B, f|f+3|3, 1, nsample)
|
147 |
+
idx: Tensor, (B, 1, nsample)
|
148 |
+
grouped_xyz: Tensor, (B, 3, 1, nsample)
|
149 |
+
"""
|
150 |
+
b, _, nsample = xyz.shape
|
151 |
+
device = xyz.device
|
152 |
+
new_xyz = torch.zeros((1, 3, 1), dtype=torch.float, device=device).repeat(b, 1, 1)
|
153 |
+
grouped_xyz = xyz.reshape((b, 3, 1, nsample))
|
154 |
+
idx = torch.arange(nsample, device=device).reshape(1, 1, nsample).repeat(b, 1, 1)
|
155 |
+
if points is not None:
|
156 |
+
if use_xyz:
|
157 |
+
new_points = torch.cat([xyz, points], 1)
|
158 |
+
else:
|
159 |
+
new_points = points
|
160 |
+
new_points = new_points.unsqueeze(2)
|
161 |
+
else:
|
162 |
+
new_points = grouped_xyz
|
163 |
+
|
164 |
+
return new_xyz, new_points, idx, grouped_xyz
|
165 |
+
|
166 |
+
|
167 |
+
class PointNet_SA_Module(nn.Module):
|
168 |
+
def __init__(self, npoint, nsample, radius, in_channel, mlp, if_bn=True, group_all=False, use_xyz=True):
|
169 |
+
"""
|
170 |
+
Args:
|
171 |
+
npoint: int, number of points to sample
|
172 |
+
nsample: int, number of points in each local region
|
173 |
+
radius: float
|
174 |
+
in_channel: int, input channel of features(points)
|
175 |
+
mlp: list of int,
|
176 |
+
"""
|
177 |
+
super(PointNet_SA_Module, self).__init__()
|
178 |
+
self.npoint = npoint
|
179 |
+
self.nsample = nsample
|
180 |
+
self.radius = radius
|
181 |
+
self.mlp = mlp
|
182 |
+
self.group_all = group_all
|
183 |
+
self.use_xyz = use_xyz
|
184 |
+
if use_xyz:
|
185 |
+
in_channel += 3
|
186 |
+
|
187 |
+
last_channel = in_channel
|
188 |
+
self.mlp_conv = []
|
189 |
+
for out_channel in mlp:
|
190 |
+
self.mlp_conv.append(Conv2d(last_channel, out_channel, if_bn=if_bn))
|
191 |
+
last_channel = out_channel
|
192 |
+
|
193 |
+
self.mlp_conv = nn.Sequential(*self.mlp_conv)
|
194 |
+
|
195 |
+
def forward(self, xyz, points):
|
196 |
+
"""
|
197 |
+
Args:
|
198 |
+
xyz: Tensor, (B, 3, N)
|
199 |
+
points: Tensor, (B, f, N)
|
200 |
+
|
201 |
+
Returns:
|
202 |
+
new_xyz: Tensor, (B, 3, npoint)
|
203 |
+
new_points: Tensor, (B, mlp[-1], npoint)
|
204 |
+
"""
|
205 |
+
if self.group_all:
|
206 |
+
new_xyz, new_points, idx, grouped_xyz = sample_and_group_all(xyz, points, self.use_xyz)
|
207 |
+
else:
|
208 |
+
new_xyz, new_points, idx, grouped_xyz = sample_and_group(xyz, points, self.npoint, self.nsample, self.radius, self.use_xyz)
|
209 |
+
|
210 |
+
new_points = self.mlp_conv(new_points)
|
211 |
+
new_points = torch.max(new_points, 3)[0]
|
212 |
+
|
213 |
+
return new_xyz, new_points
|
214 |
+
|
215 |
+
|
216 |
+
class PointNet_FP_Module(nn.Module):
|
217 |
+
def __init__(self, in_channel, mlp, use_points1=False, in_channel_points1=None, if_bn=True):
|
218 |
+
"""
|
219 |
+
Args:
|
220 |
+
in_channel: int, input channel of points2
|
221 |
+
mlp: list of int
|
222 |
+
use_points1: boolean, whether to concatenate points1 with the interpolated features
|
223 |
+
in_channel_points1: int, input channel of points1
|
224 |
+
"""
|
225 |
+
super(PointNet_FP_Module, self).__init__()
|
226 |
+
self.use_points1 = use_points1
|
227 |
+
|
228 |
+
if use_points1:
|
229 |
+
in_channel += in_channel_points1
|
230 |
+
|
231 |
+
last_channel = in_channel
|
232 |
+
self.mlp_conv = []
|
233 |
+
for out_channel in mlp:
|
234 |
+
self.mlp_conv.append(Conv1d(last_channel, out_channel, if_bn=if_bn))
|
235 |
+
last_channel = out_channel
|
236 |
+
|
237 |
+
self.mlp_conv = nn.Sequential(*self.mlp_conv)
|
238 |
+
|
239 |
+
def forward(self, xyz1, xyz2, points1, points2):
|
240 |
+
"""
|
241 |
+
Args:
|
242 |
+
xyz1: Tensor, (B, 3, N)
|
243 |
+
xyz2: Tensor, (B, 3, M)
|
244 |
+
points1: Tensor, (B, in_channel, N)
|
245 |
+
points2: Tensor, (B, in_channel, M)
|
246 |
+
|
247 |
+
Returns:
|
248 |
+
new_points: Tensor, (B, mlp[-1], N)
|
249 |
+
"""
|
250 |
+
dist, idx = three_nn(xyz1.permute(0, 2, 1).contiguous(), xyz2.permute(0, 2, 1).contiguous())
|
251 |
+
dist = torch.clamp_min(dist, 1e-10) # (B, N, 3)
|
252 |
+
recip_dist = 1.0/dist
|
253 |
+
norm = torch.sum(recip_dist, 2, keepdim=True).repeat((1, 1, 3))
|
254 |
+
weight = recip_dist / norm
|
255 |
+
interpolated_points = three_interpolate(points2, idx, weight) # B, in_channel, N
|
256 |
+
|
257 |
+
if self.use_points1:
|
258 |
+
new_points = torch.cat([interpolated_points, points1], 1)
|
259 |
+
else:
|
260 |
+
new_points = interpolated_points
|
261 |
+
|
262 |
+
new_points = self.mlp_conv(new_points)
|
263 |
+
return new_points
|
264 |
+
|
265 |
+
|
266 |
+
def square_distance(src, dst):
|
267 |
+
"""
|
268 |
+
Calculate the squared Euclidean distance between each pair of points.
|
269 |
+
|
270 |
+
src^T * dst = xn * xm + yn * ym + zn * zm;
|
271 |
+
sum(src^2, dim=-1) = xn*xn + yn*yn + zn*zn;
|
272 |
+
sum(dst^2, dim=-1) = xm*xm + ym*ym + zm*zm;
|
273 |
+
dist = (xn - xm)^2 + (yn - ym)^2 + (zn - zm)^2
|
274 |
+
= sum(src**2,dim=-1)+sum(dst**2,dim=-1)-2*src^T*dst
|
275 |
+
|
276 |
+
Input:
|
277 |
+
src: source points, [B, N, C]
|
278 |
+
dst: target points, [B, M, C]
|
279 |
+
Output:
|
280 |
+
dist: per-point square distance, [B, N, M]
|
281 |
+
"""
|
282 |
+
B, N, _ = src.shape
|
283 |
+
_, M, _ = dst.shape
|
284 |
+
dist = -2 * torch.matmul(src, dst.permute(0, 2, 1)) # B, N, M
|
285 |
+
dist += torch.sum(src ** 2, -1).view(B, N, 1)
|
286 |
+
dist += torch.sum(dst ** 2, -1).view(B, 1, M)
|
287 |
+
return dist
|
288 |
+
|
289 |
+
|
290 |
+
def query_knn(nsample, xyz, new_xyz, include_self=True):
|
291 |
+
"""Find k-NN of new_xyz in xyz"""
|
292 |
+
pad = 0 if include_self else 1
|
293 |
+
sqrdists = square_distance(new_xyz, xyz) # B, S, N
|
294 |
+
idx = torch.argsort(sqrdists, dim=-1, descending=False)[:, :, pad: nsample+pad]
|
295 |
+
return idx.int()
|
296 |
+
|
297 |
+
|
298 |
+
def sample_and_group_knn(xyz, points, npoint, k, use_xyz=True, idx=None):
|
299 |
+
"""
|
300 |
+
Args:
|
301 |
+
xyz: Tensor, (B, 3, N)
|
302 |
+
points: Tensor, (B, f, N)
|
303 |
+
npoint: int
|
304 |
+
nsample: int
|
305 |
+
radius: float
|
306 |
+
use_xyz: boolean
|
307 |
+
|
308 |
+
Returns:
|
309 |
+
new_xyz: Tensor, (B, 3, npoint)
|
310 |
+
new_points: Tensor, (B, 3 | f+3 | f, npoint, nsample)
|
311 |
+
idx_local: Tensor, (B, npoint, nsample)
|
312 |
+
grouped_xyz: Tensor, (B, 3, npoint, nsample)
|
313 |
+
|
314 |
+
"""
|
315 |
+
xyz_flipped = xyz.permute(0, 2, 1).contiguous() # (B, N, 3)
|
316 |
+
new_xyz = gather_operation(xyz, furthest_point_sample(xyz_flipped, npoint)) # (B, 3, npoint)
|
317 |
+
if idx is None:
|
318 |
+
idx = query_knn(k, xyz_flipped, new_xyz.permute(0, 2, 1).contiguous())
|
319 |
+
grouped_xyz = grouping_operation(xyz, idx) # (B, 3, npoint, nsample)
|
320 |
+
grouped_xyz -= new_xyz.unsqueeze(3).repeat(1, 1, 1, k)
|
321 |
+
|
322 |
+
if points is not None:
|
323 |
+
grouped_points = grouping_operation(points, idx) # (B, f, npoint, nsample)
|
324 |
+
if use_xyz:
|
325 |
+
new_points = torch.cat([grouped_xyz, grouped_points], 1)
|
326 |
+
else:
|
327 |
+
new_points = grouped_points
|
328 |
+
else:
|
329 |
+
new_points = grouped_xyz
|
330 |
+
|
331 |
+
return new_xyz, new_points, idx, grouped_xyz
|
332 |
+
|
333 |
+
|
334 |
+
class PointNet_SA_Module_KNN(nn.Module):
|
335 |
+
def __init__(self, npoint, nsample, in_channel, mlp, if_bn=True, group_all=False, use_xyz=True, if_idx=False):
|
336 |
+
"""
|
337 |
+
Args:
|
338 |
+
npoint: int, number of points to sample
|
339 |
+
nsample: int, number of points in each local region
|
340 |
+
radius: float
|
341 |
+
in_channel: int, input channel of features(points)
|
342 |
+
mlp: list of int,
|
343 |
+
"""
|
344 |
+
super(PointNet_SA_Module_KNN, self).__init__()
|
345 |
+
self.npoint = npoint
|
346 |
+
self.nsample = nsample
|
347 |
+
self.mlp = mlp
|
348 |
+
self.group_all = group_all
|
349 |
+
self.use_xyz = use_xyz
|
350 |
+
self.if_idx = if_idx
|
351 |
+
if use_xyz:
|
352 |
+
in_channel += 3
|
353 |
+
|
354 |
+
last_channel = in_channel
|
355 |
+
self.mlp_conv = []
|
356 |
+
for out_channel in mlp[:-1]:
|
357 |
+
self.mlp_conv.append(Conv2d(last_channel, out_channel, if_bn=if_bn))
|
358 |
+
last_channel = out_channel
|
359 |
+
self.mlp_conv.append(Conv2d(last_channel, mlp[-1], if_bn=False, activation_fn=None))
|
360 |
+
self.mlp_conv = nn.Sequential(*self.mlp_conv)
|
361 |
+
|
362 |
+
def forward(self, xyz, points, idx=None):
|
363 |
+
"""
|
364 |
+
Args:
|
365 |
+
xyz: Tensor, (B, 3, N)
|
366 |
+
points: Tensor, (B, f, N)
|
367 |
+
|
368 |
+
Returns:
|
369 |
+
new_xyz: Tensor, (B, 3, npoint)
|
370 |
+
new_points: Tensor, (B, mlp[-1], npoint)
|
371 |
+
"""
|
372 |
+
if self.group_all:
|
373 |
+
new_xyz, new_points, idx, grouped_xyz = sample_and_group_all(xyz, points, self.use_xyz)
|
374 |
+
else:
|
375 |
+
new_xyz, new_points, idx, grouped_xyz = sample_and_group_knn(xyz, points, self.npoint, self.nsample, self.use_xyz, idx=idx)
|
376 |
+
|
377 |
+
new_points = self.mlp_conv(new_points)
|
378 |
+
new_points = torch.max(new_points, 3)[0]
|
379 |
+
|
380 |
+
if self.if_idx:
|
381 |
+
return new_xyz, new_points, idx
|
382 |
+
else:
|
383 |
+
return new_xyz, new_points
|
384 |
+
|
385 |
+
|
386 |
+
def fps_subsample(pcd, n_points=2048):
|
387 |
+
"""
|
388 |
+
Args
|
389 |
+
pcd: (b, 16384, 3)
|
390 |
+
|
391 |
+
returns
|
392 |
+
new_pcd: (b, n_points, 3)
|
393 |
+
"""
|
394 |
+
new_pcd = gather_operation(pcd.permute(0, 2, 1).contiguous(), furthest_point_sample(pcd, n_points))
|
395 |
+
new_pcd = new_pcd.permute(0, 2, 1).contiguous()
|
396 |
+
return new_pcd
|
397 |
+
|
398 |
+
|
399 |
+
class Transformer(nn.Module):
|
400 |
+
def __init__(self, in_channel, dim=256, n_knn=16, pos_hidden_dim=64, attn_hidden_multiplier=4):
|
401 |
+
super(Transformer, self).__init__()
|
402 |
+
self.n_knn = n_knn
|
403 |
+
self.conv_key = nn.Conv1d(dim, dim, 1)
|
404 |
+
self.conv_query = nn.Conv1d(dim, dim, 1)
|
405 |
+
self.conv_value = nn.Conv1d(dim, dim, 1)
|
406 |
+
|
407 |
+
self.pos_mlp = nn.Sequential(
|
408 |
+
nn.Conv2d(3, pos_hidden_dim, 1),
|
409 |
+
nn.BatchNorm2d(pos_hidden_dim),
|
410 |
+
nn.ReLU(),
|
411 |
+
nn.Conv2d(pos_hidden_dim, dim, 1)
|
412 |
+
)
|
413 |
+
|
414 |
+
self.attn_mlp = nn.Sequential(
|
415 |
+
nn.Conv2d(dim, dim * attn_hidden_multiplier, 1),
|
416 |
+
nn.BatchNorm2d(dim * attn_hidden_multiplier),
|
417 |
+
nn.ReLU(),
|
418 |
+
nn.Conv2d(dim * attn_hidden_multiplier, dim, 1)
|
419 |
+
)
|
420 |
+
|
421 |
+
self.linear_start = nn.Conv1d(in_channel, dim, 1)
|
422 |
+
self.linear_end = nn.Conv1d(dim, in_channel, 1)
|
423 |
+
|
424 |
+
def forward(self, x, pos):
|
425 |
+
"""feed forward of transformer
|
426 |
+
Args:
|
427 |
+
x: Tensor of features, (B, in_channel, n)
|
428 |
+
pos: Tensor of positions, (B, 3, n)
|
429 |
+
|
430 |
+
Returns:
|
431 |
+
y: Tensor of features with attention, (B, in_channel, n)
|
432 |
+
"""
|
433 |
+
|
434 |
+
identity = x
|
435 |
+
|
436 |
+
x = self.linear_start(x)
|
437 |
+
b, dim, n = x.shape
|
438 |
+
|
439 |
+
pos_flipped = pos.permute(0, 2, 1).contiguous()
|
440 |
+
idx_knn = query_knn(self.n_knn, pos_flipped, pos_flipped)
|
441 |
+
key = self.conv_key(x)
|
442 |
+
value = self.conv_value(x)
|
443 |
+
query = self.conv_query(x)
|
444 |
+
|
445 |
+
key = grouping_operation(key, idx_knn) # b, dim, n, n_knn
|
446 |
+
qk_rel = query.reshape((b, -1, n, 1)) - key
|
447 |
+
|
448 |
+
pos_rel = pos.reshape((b, -1, n, 1)) - grouping_operation(pos, idx_knn) # b, 3, n, n_knn
|
449 |
+
pos_embedding = self.pos_mlp(pos_rel) # b, dim, n, n_knn
|
450 |
+
|
451 |
+
attention = self.attn_mlp(qk_rel + pos_embedding)
|
452 |
+
attention = torch.softmax(attention, -1)
|
453 |
+
|
454 |
+
value = value.reshape((b, -1, n, 1)) + pos_embedding
|
455 |
+
|
456 |
+
agg = einsum('b c i j, b c i j -> b c i', attention, value) # b, dim, n
|
457 |
+
y = self.linear_end(agg)
|
458 |
+
|
459 |
+
return y+identity
|
460 |
+
|
461 |
+
|
462 |
+
class CouplingLayer(nn.Module):
|
463 |
+
|
464 |
+
def __init__(self, d, intermediate_dim, swap=False):
|
465 |
+
nn.Module.__init__(self)
|
466 |
+
self.d = d - (d // 2)
|
467 |
+
self.swap = swap
|
468 |
+
self.net_s_t = nn.Sequential(
|
469 |
+
nn.Linear(self.d, intermediate_dim),
|
470 |
+
nn.ReLU(inplace=True),
|
471 |
+
nn.Linear(intermediate_dim, intermediate_dim),
|
472 |
+
nn.ReLU(inplace=True),
|
473 |
+
nn.Linear(intermediate_dim, (d - self.d) * 2),
|
474 |
+
)
|
475 |
+
|
476 |
+
def forward(self, x, logpx=None, reverse=False):
|
477 |
+
|
478 |
+
if self.swap:
|
479 |
+
x = torch.cat([x[:, self.d:], x[:, :self.d]], 1)
|
480 |
+
|
481 |
+
in_dim = self.d
|
482 |
+
out_dim = x.shape[1] - self.d
|
483 |
+
|
484 |
+
s_t = self.net_s_t(x[:, :in_dim])
|
485 |
+
scale = torch.sigmoid(s_t[:, :out_dim] + 2.)
|
486 |
+
shift = s_t[:, out_dim:]
|
487 |
+
|
488 |
+
logdetjac = torch.sum(torch.log(scale).view(scale.shape[0], -1), 1, keepdim=True)
|
489 |
+
|
490 |
+
if not reverse:
|
491 |
+
y1 = x[:, self.d:] * scale + shift
|
492 |
+
delta_logp = -logdetjac
|
493 |
+
else:
|
494 |
+
y1 = (x[:, self.d:] - shift) / scale
|
495 |
+
delta_logp = logdetjac
|
496 |
+
|
497 |
+
y = torch.cat([x[:, :self.d], y1], 1) if not self.swap else torch.cat([y1, x[:, :self.d]], 1)
|
498 |
+
|
499 |
+
if logpx is None:
|
500 |
+
return y
|
501 |
+
else:
|
502 |
+
return y, logpx + delta_logp
|
503 |
+
|
504 |
+
|
505 |
+
class SequentialFlow(nn.Module):
|
506 |
+
"""A generalized nn.Sequential container for normalizing flows.
|
507 |
+
"""
|
508 |
+
|
509 |
+
def __init__(self, layersList):
|
510 |
+
super(SequentialFlow, self).__init__()
|
511 |
+
self.chain = nn.ModuleList(layersList)
|
512 |
+
|
513 |
+
def forward(self, x, logpx=None, reverse=False, inds=None):
|
514 |
+
if inds is None:
|
515 |
+
if reverse:
|
516 |
+
inds = range(len(self.chain) - 1, -1, -1)
|
517 |
+
else:
|
518 |
+
inds = range(len(self.chain))
|
519 |
+
|
520 |
+
if logpx is None:
|
521 |
+
for i in inds:
|
522 |
+
x = self.chain[i](x, reverse=reverse)
|
523 |
+
return x
|
524 |
+
else:
|
525 |
+
for i in inds:
|
526 |
+
x, logpx = self.chain[i](x, logpx, reverse=reverse)
|
527 |
+
return x, logpx
|
528 |
+
|
529 |
+
|
530 |
+
def build_latent_flow(args):
|
531 |
+
chain = []
|
532 |
+
for i in range(args.latent_flow_depth):
|
533 |
+
chain.append(CouplingLayer(args.latent_dim, args.latent_flow_hidden_dim, swap=(i % 2 == 0)))
|
534 |
+
return SequentialFlow(chain)
|
535 |
+
|
536 |
+
|
537 |
+
##################
|
538 |
+
## SpectralNorm ##
|
539 |
+
##################
|
540 |
+
|
541 |
+
POWER_ITERATION_FN = "spectral_norm_power_iteration"
|
542 |
+
|
543 |
+
|
544 |
+
class SpectralNorm(object):
|
545 |
+
def __init__(self, name='weight', dim=0, eps=1e-12):
|
546 |
+
self.name = name
|
547 |
+
self.dim = dim
|
548 |
+
self.eps = eps
|
549 |
+
|
550 |
+
def compute_weight(self, module, n_power_iterations):
|
551 |
+
if n_power_iterations < 0:
|
552 |
+
raise ValueError(
|
553 |
+
'Expected n_power_iterations to be non-negative, but '
|
554 |
+
'got n_power_iterations={}'.format(n_power_iterations)
|
555 |
+
)
|
556 |
+
|
557 |
+
weight = getattr(module, self.name + '_orig')
|
558 |
+
u = getattr(module, self.name + '_u')
|
559 |
+
v = getattr(module, self.name + '_v')
|
560 |
+
weight_mat = weight
|
561 |
+
if self.dim != 0:
|
562 |
+
# permute dim to front
|
563 |
+
weight_mat = weight_mat.permute(self.dim, * [d for d in range(weight_mat.dim()) if d != self.dim])
|
564 |
+
height = weight_mat.size(0)
|
565 |
+
weight_mat = weight_mat.reshape(height, -1)
|
566 |
+
with torch.no_grad():
|
567 |
+
for _ in range(n_power_iterations):
|
568 |
+
# Spectral norm of weight equals to `u^T W v`, where `u` and `v`
|
569 |
+
# are the first left and right singular vectors.
|
570 |
+
# This power iteration produces approximations of `u` and `v`.
|
571 |
+
v = F.normalize(torch.matmul(weight_mat.t(), u), dim=0, eps=self.eps)
|
572 |
+
u = F.normalize(torch.matmul(weight_mat, v), dim=0, eps=self.eps)
|
573 |
+
setattr(module, self.name + '_u', u)
|
574 |
+
setattr(module, self.name + '_v', v)
|
575 |
+
|
576 |
+
sigma = torch.dot(u, torch.matmul(weight_mat, v))
|
577 |
+
weight = weight / sigma
|
578 |
+
setattr(module, self.name, weight)
|
579 |
+
|
580 |
+
def remove(self, module):
|
581 |
+
weight = getattr(module, self.name)
|
582 |
+
delattr(module, self.name)
|
583 |
+
delattr(module, self.name + '_u')
|
584 |
+
delattr(module, self.name + '_orig')
|
585 |
+
module.register_parameter(self.name, torch.nn.Parameter(weight))
|
586 |
+
|
587 |
+
def get_update_method(self, module):
|
588 |
+
def update_fn(module, n_power_iterations):
|
589 |
+
self.compute_weight(module, n_power_iterations)
|
590 |
+
|
591 |
+
return update_fn
|
592 |
+
|
593 |
+
def __call__(self, module, unused_inputs):
|
594 |
+
del unused_inputs
|
595 |
+
self.compute_weight(module, n_power_iterations=0)
|
596 |
+
|
597 |
+
# requires_grad might be either True or False during inference.
|
598 |
+
if not module.training:
|
599 |
+
r_g = getattr(module, self.name + '_orig').requires_grad
|
600 |
+
setattr(module, self.name, getattr(module, self.name).detach().requires_grad_(r_g))
|
601 |
+
|
602 |
+
@staticmethod
|
603 |
+
def apply(module, name, dim, eps):
|
604 |
+
fn = SpectralNorm(name, dim, eps)
|
605 |
+
weight = module._parameters[name]
|
606 |
+
height = weight.size(dim)
|
607 |
+
|
608 |
+
u = F.normalize(weight.new_empty(height).normal_(0, 1), dim=0, eps=fn.eps)
|
609 |
+
v = F.normalize(weight.new_empty(int(weight.numel() / height)).normal_(0, 1), dim=0, eps=fn.eps)
|
610 |
+
delattr(module, fn.name)
|
611 |
+
module.register_parameter(fn.name + "_orig", weight)
|
612 |
+
# We still need to assign weight back as fn.name because all sorts of
|
613 |
+
# things may assume that it exists, e.g., when initializing weights.
|
614 |
+
# However, we can't directly assign as it could be an nn.Parameter and
|
615 |
+
# gets added as a parameter. Instead, we register weight.data as a
|
616 |
+
# buffer, which will cause weight to be included in the state dict
|
617 |
+
# and also supports nn.init due to shared storage.
|
618 |
+
module.register_buffer(fn.name, weight.data)
|
619 |
+
module.register_buffer(fn.name + "_u", u)
|
620 |
+
module.register_buffer(fn.name + "_v", v)
|
621 |
+
|
622 |
+
setattr(module, POWER_ITERATION_FN, types.MethodType(fn.get_update_method(module), module))
|
623 |
+
|
624 |
+
module.register_forward_pre_hook(fn)
|
625 |
+
return fn
|
626 |
+
|
627 |
+
|
628 |
+
def inplace_spectral_norm(module, name='weight', dim=None, eps=1e-12):
|
629 |
+
r"""Applies spectral normalization to a parameter in the given module.
|
630 |
+
.. math::
|
631 |
+
\mathbf{W} = \dfrac{\mathbf{W}}{\sigma(\mathbf{W})} \\
|
632 |
+
\sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \dfrac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}
|
633 |
+
Spectral normalization stabilizes the training of discriminators (critics)
|
634 |
+
in Generative Adversarial Networks (GANs) by rescaling the weight tensor
|
635 |
+
with spectral norm :math:`\sigma` of the weight matrix calculated using
|
636 |
+
power iteration method. If the dimension of the weight tensor is greater
|
637 |
+
than 2, it is reshaped to 2D in power iteration method to get spectral
|
638 |
+
norm. This is implemented via a hook that calculates spectral norm and
|
639 |
+
rescales weight before every :meth:`~Module.forward` call.
|
640 |
+
See `Spectral Normalization for Generative Adversarial Networks`_ .
|
641 |
+
.. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957
|
642 |
+
Args:
|
643 |
+
module (nn.Module): containing module
|
644 |
+
name (str, optional): name of weight parameter
|
645 |
+
n_power_iterations (int, optional): number of power iterations to
|
646 |
+
calculate spectral norm
|
647 |
+
dim (int, optional): dimension corresponding to number of outputs,
|
648 |
+
the default is 0, except for modules that are instances of
|
649 |
+
ConvTranspose1/2/3d, when it is 1
|
650 |
+
eps (float, optional): epsilon for numerical stability in
|
651 |
+
calculating norms
|
652 |
+
Returns:
|
653 |
+
The original module with the spectral norm hook
|
654 |
+
Example::
|
655 |
+
>>> m = spectral_norm(nn.Linear(20, 40))
|
656 |
+
Linear (20 -> 40)
|
657 |
+
>>> m.weight_u.size()
|
658 |
+
torch.Size([20])
|
659 |
+
"""
|
660 |
+
if dim is None:
|
661 |
+
if isinstance(module, (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d)):
|
662 |
+
dim = 1
|
663 |
+
else:
|
664 |
+
dim = 0
|
665 |
+
SpectralNorm.apply(module, name, dim=dim, eps=eps)
|
666 |
+
return module
|
667 |
+
|
668 |
+
|
669 |
+
def remove_spectral_norm(module, name='weight'):
|
670 |
+
r"""Removes the spectral normalization reparameterization from a module.
|
671 |
+
Args:
|
672 |
+
module (nn.Module): containing module
|
673 |
+
name (str, optional): name of weight parameter
|
674 |
+
Example:
|
675 |
+
>>> m = spectral_norm(nn.Linear(40, 10))
|
676 |
+
>>> remove_spectral_norm(m)
|
677 |
+
"""
|
678 |
+
for k, hook in module._forward_pre_hooks.items():
|
679 |
+
if isinstance(hook, SpectralNorm) and hook.name == name:
|
680 |
+
hook.remove(module)
|
681 |
+
del module._forward_pre_hooks[k]
|
682 |
+
return module
|
683 |
+
|
684 |
+
raise ValueError("spectral_norm of '{}' not found in {}".format(name, module))
|
685 |
+
|
686 |
+
|
687 |
+
def add_spectral_norm(model, logger=None):
|
688 |
+
"""Applies spectral norm to all modules within the scope of a CNF."""
|
689 |
+
|
690 |
+
def apply_spectral_norm(module):
|
691 |
+
if 'weight' in module._parameters:
|
692 |
+
if logger: logger.info("Adding spectral norm to {}".format(module))
|
693 |
+
inplace_spectral_norm(module, 'weight')
|
694 |
+
|
695 |
+
def find_coupling_layer(module):
|
696 |
+
if isinstance(module, CouplingLayer):
|
697 |
+
module.apply(apply_spectral_norm)
|
698 |
+
else:
|
699 |
+
for child in module.children():
|
700 |
+
find_coupling_layer(child)
|
701 |
+
|
702 |
+
find_coupling_layer(model)
|
703 |
+
|
704 |
+
|
705 |
+
def spectral_norm_power_iteration(model, n_power_iterations=1):
|
706 |
+
|
707 |
+
def recursive_power_iteration(module):
|
708 |
+
if hasattr(module, POWER_ITERATION_FN):
|
709 |
+
getattr(module, POWER_ITERATION_FN)(n_power_iterations)
|
710 |
+
|
711 |
+
model.apply(recursive_power_iteration)
|
712 |
+
|
713 |
+
def reparameterize_gaussian(mean, logvar):
|
714 |
+
std = torch.exp(0.5 * logvar)
|
715 |
+
eps = torch.randn(std.size()).to(mean)
|
716 |
+
return mean + std * eps
|
717 |
+
|
718 |
+
|
719 |
+
|
720 |
+
def gaussian_entropy(logvar):
|
721 |
+
const = 0.5 * float(logvar.size(1)) * (1. + np.log(np.pi * 2))
|
722 |
+
ent = 0.5 * logvar.sum(dim=1, keepdim=False) + const
|
723 |
+
return ent
|
724 |
+
|
725 |
+
|
726 |
+
def standard_normal_logprob(z):
|
727 |
+
dim = z.size(-1)
|
728 |
+
log_z = -0.5 * dim * np.log(2 * np.pi)
|
729 |
+
return log_z - z.pow(2) / 2
|
730 |
+
|
731 |
+
def truncated_normal_(tensor, mean=0, std=1, trunc_std=2):
|
732 |
+
"""
|
733 |
+
Taken from https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
|
734 |
+
"""
|
735 |
+
size = tensor.shape
|
736 |
+
tmp = tensor.new_empty(size + (4,)).normal_()
|
737 |
+
valid = (tmp < trunc_std) & (tmp > -trunc_std)
|
738 |
+
ind = valid.max(-1, keepdim=True)[1]
|
739 |
+
tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
|
740 |
+
tensor.data.mul_(std).add_(mean)
|
741 |
+
return tensor
|
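Note: query_knn above is pure PyTorch, so a quick CPU sanity check is possible (import path assumed; importing the module still requires pointnet2_ops to be installed), illustrative only:

    import torch
    from tgs.models.snowflake.utils import query_knn  # assumed import path

    xyz = torch.rand(2, 256, 3)        # (B, N, 3) reference points
    new_xyz = xyz[:, :32, :]           # (B, S, 3) query points
    idx = query_knn(8, xyz, new_xyz)   # (B, S, 8) int indices of the 8 nearest neighbors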
hort/models/tgs/models/tokenizers/dinov2.py
ADDED
@@ -0,0 +1,1179 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
""" PyTorch DINOv2 model."""
|
16 |
+
|
17 |
+
|
18 |
+
import collections.abc
|
19 |
+
import math
|
20 |
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
21 |
+
from dataclasses import dataclass
|
22 |
+
|
23 |
+
import torch
|
24 |
+
import torch.utils.checkpoint
|
25 |
+
import torch.nn.functional as F
|
26 |
+
from torch import nn
|
27 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
28 |
+
|
29 |
+
from transformers.activations import ACT2FN
|
30 |
+
from transformers.modeling_outputs import (
|
31 |
+
BackboneOutput,
|
32 |
+
BaseModelOutput,
|
33 |
+
BaseModelOutputWithPooling,
|
34 |
+
ImageClassifierOutput,
|
35 |
+
)
|
36 |
+
from transformers.modeling_utils import PreTrainedModel
|
37 |
+
from transformers.pytorch_utils import (
|
38 |
+
find_pruneable_heads_and_indices,
|
39 |
+
prune_linear_layer,
|
40 |
+
)
|
41 |
+
from transformers.utils import (
|
42 |
+
add_code_sample_docstrings,
|
43 |
+
add_start_docstrings,
|
44 |
+
add_start_docstrings_to_model_forward,
|
45 |
+
logging,
|
46 |
+
replace_return_docstrings,
|
47 |
+
)
|
48 |
+
from transformers.utils.backbone_utils import BackboneMixin
|
49 |
+
from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
|
50 |
+
|
51 |
+
from tgs.models.transformers import MemoryEfficientAttentionMixin
|
52 |
+
from tgs.utils.typing import *
|
53 |
+
|
54 |
+
|
55 |
+
logger = logging.get_logger(__name__)
|
56 |
+
|
57 |
+
# General docstring
|
58 |
+
_CONFIG_FOR_DOC = "Dinov2Config"
|
59 |
+
|
60 |
+
# Base docstring
|
61 |
+
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
|
62 |
+
_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
|
63 |
+
|
64 |
+
# Image classification docstring
|
65 |
+
_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-base"
|
66 |
+
|
67 |
+
|
68 |
+
DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
69 |
+
"facebook/dinov2-base",
|
70 |
+
# See all DINOv2 models at https://huggingface.co/models?filter=dinov2
|
71 |
+
]
|
72 |
+
|
73 |
+
|
74 |
+
class Dinov2Embeddings(nn.Module):
|
75 |
+
"""
|
76 |
+
Construct the CLS token, mask token, position and patch embeddings.
|
77 |
+
"""
|
78 |
+
|
79 |
+
def __init__(self, config: Dinov2Config) -> None:
|
80 |
+
super().__init__()
|
81 |
+
|
82 |
+
self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
|
83 |
+
# register as mask token as it's not used in optimization
|
84 |
+
# to avoid the use of find_unused_parameters_true
|
85 |
+
# self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
|
86 |
+
self.register_buffer("mask_token", torch.zeros(1, config.hidden_size))
|
87 |
+
self.patch_embeddings = Dinov2PatchEmbeddings(config)
|
88 |
+
num_patches = self.patch_embeddings.num_patches
|
89 |
+
self.position_embeddings = nn.Parameter(
|
90 |
+
torch.randn(1, num_patches + 1, config.hidden_size)
|
91 |
+
)
|
92 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
93 |
+
self.config = config
|
94 |
+
|
95 |
+
def interpolate_pos_encoding(
|
96 |
+
self, embeddings: torch.Tensor, height: int, width: int
|
97 |
+
) -> torch.Tensor:
|
98 |
+
"""
|
99 |
+
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution images.
|
101 |
+
|
102 |
+
Source:
|
103 |
+
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
|
104 |
+
"""
|
105 |
+
|
106 |
+
num_patches = embeddings.shape[1] - 1
|
107 |
+
num_positions = self.position_embeddings.shape[1] - 1
|
108 |
+
if num_patches == num_positions and height == width:
|
109 |
+
return self.position_embeddings
|
110 |
+
class_pos_embed = self.position_embeddings[:, 0]
|
111 |
+
patch_pos_embed = self.position_embeddings[:, 1:]
|
112 |
+
dim = embeddings.shape[-1]
|
113 |
+
height = height // self.config.patch_size
|
114 |
+
width = width // self.config.patch_size
|
115 |
+
# we add a small number to avoid floating point error in the interpolation
|
116 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
117 |
+
height, width = height + 0.1, width + 0.1
|
118 |
+
patch_pos_embed = patch_pos_embed.reshape(
|
119 |
+
1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
|
120 |
+
)
|
121 |
+
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
|
122 |
+
patch_pos_embed = nn.functional.interpolate(
|
123 |
+
patch_pos_embed,
|
124 |
+
scale_factor=(
|
125 |
+
height / math.sqrt(num_positions),
|
126 |
+
width / math.sqrt(num_positions),
|
127 |
+
),
|
128 |
+
mode="bicubic",
|
129 |
+
align_corners=False,
|
130 |
+
)
|
131 |
+
if (
|
132 |
+
int(height) != patch_pos_embed.shape[-2]
|
133 |
+
or int(width) != patch_pos_embed.shape[-1]
|
134 |
+
):
|
135 |
+
raise ValueError(
|
136 |
+
"Width or height does not match with the interpolated position embeddings"
|
137 |
+
)
|
138 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
139 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
|
140 |
+
|
141 |
+
def forward(
|
142 |
+
self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None,
|
143 |
+
) -> torch.Tensor:
|
144 |
+
batch_size, _, height, width = pixel_values.shape
|
145 |
+
patch_embeddings = self.patch_embeddings(pixel_values)
|
146 |
+
embeddings = patch_embeddings
|
147 |
+
|
148 |
+
if bool_masked_pos is not None:
|
149 |
+
embeddings = torch.where(
|
150 |
+
bool_masked_pos.unsqueeze(-1),
|
151 |
+
self.mask_token.to(embeddings.dtype).unsqueeze(0),
|
152 |
+
embeddings,
|
153 |
+
)
|
154 |
+
|
155 |
+
# add the [CLS] token to the embedded patch tokens
|
156 |
+
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
|
157 |
+
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
|
158 |
+
|
159 |
+
# add positional encoding to each token
|
160 |
+
embeddings = embeddings + self.interpolate_pos_encoding(
|
161 |
+
embeddings, height, width
|
162 |
+
)
|
163 |
+
|
164 |
+
embeddings = self.dropout(embeddings)
|
165 |
+
|
166 |
+
return embeddings
|
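Note: a minimal smoke test of Dinov2Embeddings (illustrative; assumes hort/models is on the import path, and passes an explicit image/patch size so the shapes are unambiguous):

    import torch
    from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
    from tgs.models.tokenizers.dinov2 import Dinov2Embeddings  # assumed import path

    config = Dinov2Config(image_size=224, patch_size=16, hidden_size=768, num_channels=3)
    emb = Dinov2Embeddings(config)
    pixels = torch.rand(1, 3, 224, 224)
    tokens = emb(pixels)               # (1, 1 + (224 // 16) ** 2, 768) -> (1, 197, 768)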
167 |
+
|
168 |
+
|
169 |
+
class Dinov2PatchEmbeddings(nn.Module):
|
170 |
+
"""
|
171 |
+
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
|
172 |
+
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
|
173 |
+
Transformer.
|
174 |
+
"""
|
175 |
+
|
176 |
+
def __init__(self, config):
|
177 |
+
super().__init__()
|
178 |
+
image_size, patch_size = config.image_size, config.patch_size
|
179 |
+
num_channels, hidden_size = config.num_channels, config.hidden_size
|
180 |
+
|
181 |
+
image_size = (
|
182 |
+
image_size
|
183 |
+
if isinstance(image_size, collections.abc.Iterable)
|
184 |
+
else (image_size, image_size)
|
185 |
+
)
|
186 |
+
patch_size = (
|
187 |
+
patch_size
|
188 |
+
if isinstance(patch_size, collections.abc.Iterable)
|
189 |
+
else (patch_size, patch_size)
|
190 |
+
)
|
191 |
+
num_patches = (image_size[1] // patch_size[1]) * (
|
192 |
+
image_size[0] // patch_size[0]
|
193 |
+
)
|
194 |
+
self.image_size = image_size
|
195 |
+
self.patch_size = patch_size
|
196 |
+
self.num_channels = num_channels
|
197 |
+
self.num_patches = num_patches
|
198 |
+
|
199 |
+
self.projection = nn.Conv2d(
|
200 |
+
num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
|
201 |
+
)
|
202 |
+
|
203 |
+
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
204 |
+
num_channels = pixel_values.shape[1]
|
205 |
+
if num_channels != self.num_channels:
|
206 |
+
raise ValueError(
|
207 |
+
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
|
208 |
+
f" Expected {self.num_channels} but got {num_channels}."
|
209 |
+
)
|
210 |
+
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
|
211 |
+
return embeddings
|
212 |
+
|
213 |
+
|
214 |
+
# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
|
215 |
+
class Dinov2SelfAttention(nn.Module):
|
216 |
+
def __init__(self, config: Dinov2Config) -> None:
|
217 |
+
super().__init__()
|
218 |
+
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
219 |
+
config, "embedding_size"
|
220 |
+
):
|
221 |
+
raise ValueError(
|
222 |
+
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
|
223 |
+
f"heads {config.num_attention_heads}."
|
224 |
+
)
|
225 |
+
|
226 |
+
self.num_attention_heads = config.num_attention_heads
|
227 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
228 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
229 |
+
self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
|
230 |
+
|
231 |
+
self.query = nn.Linear(
|
232 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
233 |
+
)
|
234 |
+
self.key = nn.Linear(
|
235 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
236 |
+
)
|
237 |
+
self.value = nn.Linear(
|
238 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
239 |
+
)
|
240 |
+
|
241 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
242 |
+
|
243 |
+
self.use_memory_efficient_attention_xformers: bool = False
|
244 |
+
|
245 |
+
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
|
246 |
+
new_x_shape = x.size()[:-1] + (
|
247 |
+
self.num_attention_heads,
|
248 |
+
self.attention_head_size,
|
249 |
+
)
|
250 |
+
x = x.view(new_x_shape)
|
251 |
+
return x.permute(0, 2, 1, 3)
|
252 |
+
|
253 |
+
def forward(
|
254 |
+
self,
|
255 |
+
hidden_states,
|
256 |
+
head_mask: Optional[torch.Tensor] = None,
|
257 |
+
output_attentions: bool = False,
|
258 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
259 |
+
mixed_query_layer = self.query(hidden_states)
|
260 |
+
|
261 |
+
if self.use_memory_efficient_attention_xformers:
|
262 |
+
import xformers
|
263 |
+
assert head_mask is None and not output_attentions
|
264 |
+
new_size = hidden_states.size()[:-1] + (
|
265 |
+
self.num_attention_heads,
|
266 |
+
self.attention_head_size,
|
267 |
+
)
|
268 |
+
key_layer = self.key(hidden_states).view(new_size)
|
269 |
+
value_layer = self.value(hidden_states).view(new_size)
|
270 |
+
query_layer = mixed_query_layer.view(new_size)
|
271 |
+
context_layer = xformers.ops.memory_efficient_attention(
|
272 |
+
query_layer, key_layer, value_layer, p=self.attention_probs_dropout_prob
|
273 |
+
)
|
274 |
+
context_layer = context_layer.view(*hidden_states.size()[:-1], -1)
|
275 |
+
else:
|
276 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
277 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
278 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
279 |
+
|
280 |
+
try:
|
281 |
+
context_layer = F.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=head_mask, dropout_p=(self.dropout.p if self.training else 0.0), scale=1/math.sqrt(self.attention_head_size))
|
282 |
+
except Exception:
|
283 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
284 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
285 |
+
|
286 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
287 |
+
|
288 |
+
# Normalize the attention scores to probabilities.
|
289 |
+
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
|
290 |
+
|
291 |
+
# This is actually dropping out entire tokens to attend to, which might
|
292 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
293 |
+
attention_probs = self.dropout(attention_probs)
|
294 |
+
|
295 |
+
# Mask heads if we want to
|
296 |
+
if head_mask is not None:
|
297 |
+
attention_probs = attention_probs * head_mask
|
298 |
+
|
299 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
300 |
+
|
301 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
302 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
303 |
+
context_layer = context_layer.view(new_context_layer_shape)
|
304 |
+
|
305 |
+
outputs = (
|
306 |
+
(context_layer, attention_probs) if output_attentions else (context_layer,)
|
307 |
+
)
|
308 |
+
|
309 |
+
return outputs
|
310 |
+
|
311 |
+
def set_use_memory_efficient_attention_xformers(
|
312 |
+
self, valid: bool, attention_op: Optional[Callable] = None
|
313 |
+
):
|
314 |
+
self.use_memory_efficient_attention_xformers = valid
|
315 |
+
|
316 |
+
|
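# Shape sketch for Dinov2SelfAttention above (illustrative, using the
# facebook/dinov2-base configuration with hidden_size=768 and num_attention_heads=12):
# transpose_for_scores reshapes (B, N, 768) -> (B, N, 12, 64) -> (B, 12, N, 64),
# attention mixes values per head, and the final permute/view in forward folds the
# heads back into a (B, N, 768) sequence.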
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
    ) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads,
            self.attention.num_attention_heads,
            self.attention.attention_head_size,
            self.pruned_heads,
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = (
            self.attention.attention_head_size * self.attention.num_attention_heads
        )
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(
            config.layerscale_value * torch.ones(config.hidden_size)
        )

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(
    input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath
class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


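# Worked example for drop_path above (hypothetical values): with drop_prob=0.1,
# training=True and an input of shape (4, 257, 768), `shape` is (4, 1, 1) and
# `random_tensor` holds 0.9 + U(0, 1) per sample; flooring binarizes it, e.g.
# [[[1.]], [[1.]], [[0.]], [[1.]]], so one sample's residual branch is dropped
# while the survivors are rescaled by 1 / keep_prob, keeping the expected
# activation unchanged.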
class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state


class Dinov2SwiGLUFFN(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.weights_in(hidden_state)
        x1, x2 = hidden_state.chunk(2, dim=-1)
        hidden = nn.functional.silu(x1) * x2
        return self.weights_out(hidden)


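# Worked example for the SwiGLU width above (illustrative numbers only): with
# hidden_size=1536 and mlp_ratio=4, hidden_features starts at 6144, is reduced to
# 2/3 (4096) and rounded up to a multiple of 8, i.e. (4096 + 7) // 8 * 8 = 4096,
# so weights_in maps 1536 -> 8192 (two chunks of 4096) and weights_out maps 4096 -> 1536.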
class Dinov2Layer(nn.Module, MemoryEfficientAttentionMixin):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.norm1_modulation = None
        self.attention = Dinov2Attention(config)
        self.layer_scale1 = Dinov2LayerScale(config)
        self.drop_path1 = (
            Dinov2DropPath(config.drop_path_rate)
            if config.drop_path_rate > 0.0
            else nn.Identity()
        )

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.norm2_modulation = None

        if config.use_swiglu_ffn:
            self.mlp = Dinov2SwiGLUFFN(config)
        else:
            self.mlp = Dinov2MLP(config)
        self.layer_scale2 = Dinov2LayerScale(config)
        self.drop_path2 = (
            Dinov2DropPath(config.drop_path_rate)
            if config.drop_path_rate > 0.0
            else nn.Identity()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        modulation_cond: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states_norm = self.norm1(hidden_states)
        if self.norm1_modulation is not None:
            assert modulation_cond is not None
            hidden_states_norm = self.norm1_modulation(
                hidden_states_norm, modulation_cond
            )
        self_attention_outputs = self.attention(
            hidden_states_norm,  # in Dinov2, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]

        attention_output = self.layer_scale1(attention_output)
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Dinov2, layernorm is also applied after self-attention
        layer_output = self.norm2(hidden_states)
        if self.norm2_modulation is not None:
            assert modulation_cond is not None
            layer_output = self.norm2_modulation(layer_output, modulation_cond)
        layer_output = self.mlp(layer_output)
        layer_output = self.layer_scale2(layer_output)

        # second residual connection
        layer_output = layer_output + hidden_states

        outputs = (layer_output,) + outputs

        return outputs

    def register_ada_norm_modulation(self, norm1_mod: nn.Module, norm2_mod: nn.Module):
        self.norm1_modulation = norm1_mod
        self.norm2_modulation = norm2_mod


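# Block flow of Dinov2Layer in brief (pre-norm transformer block with optional
# AdaLN-style modulation registered via register_ada_norm_modulation; sketch only):
#   x = x + LayerScale1(Attention(mod1(LN1(x), cond)))
#   x = x + LayerScale2(MLP(mod2(LN2(x), cond)))
# mod1/mod2 default to identity; note that drop_path1/drop_path2 are constructed
# in __init__ but not applied in the forward pass as written.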
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
class Dinov2Encoder(nn.Module, MemoryEfficientAttentionMixin):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [Dinov2Layer(config) for _ in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        modulation_cond: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    layer_head_mask,
                    modulation_cond,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, layer_head_mask, modulation_cond, output_attentions
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions]
                if v is not None
            )
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class Dinov2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Dinov2Embeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)

            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)

    def _set_gradient_checkpointing(
        self, module: Dinov2Encoder, value: bool = False
    ) -> None:
        if isinstance(module, Dinov2Encoder):
            module.gradient_checkpointing = value


DINOV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DINOV2_BASE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

DINOV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@dataclass
class CustomBaseModelOutputWithPooling(BaseModelOutputWithPooling):
    patch_embeddings: Optional[torch.FloatTensor] = None


@add_start_docstrings(
    "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
    DINOV2_START_DOCSTRING,
)
class Dinov2Model(Dinov2PreTrainedModel, MemoryEfficientAttentionMixin):
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config

        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        modulation_cond: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos
        )

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            modulation_cond=modulation_cond,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            head_outputs = (sequence_output, pooled_output)
            return head_outputs + encoder_outputs[1:]

        return CustomBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            patch_embeddings=embedding_output,
        )

    def set_gradient_checkpointing(self, value: bool = False) -> None:
        self._set_gradient_checkpointing(self.encoder, value)


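# Output-shape sketch for Dinov2Model above (facebook/dinov2-base, 224x224 input,
# patch size 14; illustrative): the embeddings yield 16*16 = 256 patch tokens plus
# one CLS token, so last_hidden_state is (B, 257, 768), pooler_output is the CLS
# row (B, 768), and patch_embeddings exposes the pre-encoder token sequence that
# CustomBaseModelOutputWithPooling adds on top of the standard HF output.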
@add_start_docstrings(
    """
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.dinov2 = Dinov2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size * 2, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.dinov2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size

        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]

        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [
            config.hidden_size for _ in range(config.num_hidden_layers + 1)
        ]
        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.apply_layernorm:
                    hidden_state = self.layernorm(hidden_state)
                if self.config.reshape_hidden_states:
                    batch_size, _, height, width = pixel_values.shape
                    patch_size = self.config.patch_size
                    hidden_state = hidden_state[:, 1:, :].reshape(
                        batch_size, width // patch_size, height // patch_size, -1
                    )
                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions if output_attentions else None,
        )


class CustomPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, image_size: int, patch_size: int, num_channels: int, hidden_size: int):
        super().__init__()

        image_size = (
            image_size
            if isinstance(image_size, collections.abc.Iterable)
            else (image_size, image_size)
        )
        patch_size = (
            patch_size
            if isinstance(patch_size, collections.abc.Iterable)
            else (patch_size, patch_size)
        )
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(
            num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


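# Worked example for CustomPatchEmbeddings above (hypothetical numbers): with
# image_size=224 and patch_size=16, num_patches = (224 // 16) * (224 // 16) = 196,
# so the strided Conv2d projection maps (B, num_channels, 224, 224) to a
# (B, 196, hidden_size) token sequence.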
class CustomEmbeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, image_size: int, patch_size: int, num_channels: int, hidden_size: int) -> None:
        super().__init__()

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_size = hidden_size

        self.cls_token = nn.Parameter(torch.randn(1, 1, self.hidden_size))

        self.patch_embeddings = CustomPatchEmbeddings(image_size, patch_size, num_channels, hidden_size)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(
            torch.randn(1, num_patches + 1, self.hidden_size)
        )

    def interpolate_pos_encoding(
        self, embeddings: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1
        if num_patches == num_positions and height == width:
            return self.position_embeddings
        class_pos_embed = self.position_embeddings[:, 0]
        patch_pos_embed = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        height = height // self.patch_size
        width = width // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        height, width = height + 0.1, width + 0.1
        patch_pos_embed = patch_pos_embed.reshape(
            1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
        )
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(
                height / math.sqrt(num_positions),
                width / math.sqrt(num_positions),
            ),
            mode="bicubic",
            align_corners=False,
        )
        if (
            int(height) != patch_pos_embed.shape[-2]
            or int(width) != patch_pos_embed.shape[-1]
        ):
            raise ValueError(
                "Width or height does not match with the interpolated position embeddings"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

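    # Shape sketch for interpolate_pos_encoding above (hypothetical numbers): if
    # the table was trained for a 16x16 patch grid (256 positions + CLS) and the
    # current input produces a 32x32 grid, the 256 patch position embeddings are
    # reshaped to (1, 16, 16, dim), bicubically resized to (1, 32, 32, dim),
    # flattened back to (1, 1024, dim), and re-concatenated with the CLS embedding.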
    def forward(
        self, pixel_values: torch.Tensor,
    ) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        patch_embeddings = self.patch_embeddings(pixel_values)
        embeddings = patch_embeddings

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + self.interpolate_pos_encoding(
            embeddings, height, width
        )

        return embeddings
hort/models/tgs/models/tokenizers/image.py
ADDED
@@ -0,0 +1,123 @@
from dataclasses import dataclass

import torch
import torch.nn as nn
from einops import rearrange

from tgs.utils.base import BaseModule
from tgs.models.tokenizers.dinov2 import Dinov2Model
from tgs.models.transformers import Modulation
from tgs.utils.typing import *


class DINOV2SingleImageTokenizer(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        pretrained_model_name_or_path: str = "facebook/dinov2-base"
        width: int = 224
        height: int = 224
        modulation: bool = False
        modulation_zero_init: bool = False
        modulation_single_layer: bool = False
        modulation_cond_dim: int = 16
        freeze_backbone_params: bool = True
        enable_memory_efficient_attention: bool = False
        enable_gradient_checkpointing: bool = False
        use_patch_embeddings: bool = False
        patch_embeddings_aggr_method: str = 'concat'

    cfg: Config

    def configure(self) -> None:
        super().configure()
        model: Dinov2Model

        if self.cfg.freeze_backbone_params:
            # freeze dino backbone parameters
            self.register_non_module(
                "model",
                Dinov2Model.from_pretrained(self.cfg.pretrained_model_name_or_path).to(
                    self.device
                ),
            )

            model = self.non_module("model")
            for p in model.parameters():
                p.requires_grad_(False)
            model.eval()
        else:
            self.model = Dinov2Model.from_pretrained(
                self.cfg.pretrained_model_name_or_path
            ).to(self.device)
            model = self.model

        model.set_use_memory_efficient_attention_xformers(
            self.cfg.enable_memory_efficient_attention
        )
        model.set_gradient_checkpointing(self.cfg.enable_gradient_checkpointing)

        # add modulation
        if self.cfg.modulation:
            modulations = []
            for layer in model.encoder.layer:
                norm1_modulation = Modulation(
                    model.config.hidden_size,
                    self.cfg.modulation_cond_dim,
                    zero_init=self.cfg.modulation_zero_init,
                    single_layer=self.cfg.modulation_single_layer,
                )
                norm2_modulation = Modulation(
                    model.config.hidden_size,
                    self.cfg.modulation_cond_dim,
                    zero_init=self.cfg.modulation_zero_init,
                    single_layer=self.cfg.modulation_single_layer,
                )
                layer.register_ada_norm_modulation(norm1_modulation, norm2_modulation)
                modulations += [norm1_modulation, norm2_modulation]
            self.modulations = nn.ModuleList(modulations)

    def forward(
        self,
        images: Float[Tensor, "B *N C H W"],
        modulation_cond: Optional[Float[Tensor, "B *N Cc"]],
    ) -> Float[Tensor, "B *N Ct Nt"]:
        model: Dinov2Model
        if self.cfg.freeze_backbone_params:
            model = self.non_module("model")
        else:
            model = self.model

        packed = False
        if images.ndim == 4:
            packed = True
            images = images.unsqueeze(1)
            if modulation_cond is not None:
                assert modulation_cond.ndim == 2
                modulation_cond = modulation_cond.unsqueeze(1)

        batch_size, n_input_views = images.shape[:2]
        out = model(
            rearrange(images, "B N C H W -> (B N) C H W"),
            modulation_cond=rearrange(modulation_cond, "B N Cc -> (B N) Cc")
            if modulation_cond is not None
            else None,
        )
        local_features, global_features = out.last_hidden_state, out.pooler_output
        if self.cfg.use_patch_embeddings:
            patch_embeddings = out.patch_embeddings
            if self.cfg.patch_embeddings_aggr_method == 'concat':
                local_features = torch.cat([local_features, patch_embeddings], dim=1)
            elif self.cfg.patch_embeddings_aggr_method == 'add':
                local_features = local_features + patch_embeddings
            else:
                raise NotImplementedError
        local_features = local_features.permute(0, 2, 1)
        local_features = rearrange(
            local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
        )
        if packed:
            local_features = local_features.squeeze(1)

        return local_features

    def detokenize(self, *args, **kwargs):
        raise NotImplementedError
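# Minimal usage sketch for DINOV2SingleImageTokenizer (illustrative only; the exact
# BaseModule construction signature is assumed, and shapes correspond to
# facebook/dinov2-base at 224x224):
#
#   tokenizer = DINOV2SingleImageTokenizer(cfg={"freeze_backbone_params": True})
#   images = torch.rand(2, 3, 224, 224)                 # (B, C, H, W)
#   tokens = tokenizer(images, modulation_cond=None)    # (B, Ct, Nt) = (2, 768, 257)
#
# With use_patch_embeddings=True and the default 'concat' aggregation, Nt grows to
# 514 because the pre-encoder patch embeddings are appended along the token axis.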