CoreloneH committed on
Commit 7cc4b41 · 1 Parent(s): 5207cf4

Add application file

Files changed (47)
  1. configs/realcustom_sigdino_highres.json +119 -0
  2. configs/realcustom_sigdino_highres_shallow.json +114 -0
  3. inference/__pycache__/inference_utils.cpython-310.pyc +0 -0
  4. inference/__pycache__/mask_generation.cpython-310.pyc +0 -0
  5. inference/__pycache__/pipeline.cpython-310.pyc +0 -0
  6. inference/app.py +82 -0
  7. inference/inference_single_image.py +317 -0
  8. inference/inference_single_image.sh +55 -0
  9. inference/inference_utils.py +76 -0
  10. inference/mask_generation.py +114 -0
  11. inference/pipeline.py +359 -0
  12. models/__pycache__/attention_custom.cpython-310.pyc +0 -0
  13. models/__pycache__/attention_processor_custom_cross.cpython-310.pyc +0 -0
  14. models/__pycache__/base_vision.cpython-310.pyc +0 -0
  15. models/__pycache__/dino.cpython-310.pyc +0 -0
  16. models/__pycache__/image_encoder_siglipdino_shallowdeep.cpython-310.pyc +0 -0
  17. models/__pycache__/projectors.cpython-310.pyc +0 -0
  18. models/__pycache__/sigclip.cpython-310.pyc +0 -0
  19. models/__pycache__/text.cpython-310.pyc +0 -0
  20. models/__pycache__/transformer_2d_custom.cpython-310.pyc +0 -0
  21. models/__pycache__/unet_2d_blocks_custom.cpython-310.pyc +0 -0
  22. models/__pycache__/unet_2d_condition_custom.cpython-310.pyc +0 -0
  23. models/__pycache__/vae.cpython-310.pyc +0 -0
  24. models/attention_custom.py +425 -0
  25. models/attention_processor_custom_cross.py +1778 -0
  26. models/base_vision.py +227 -0
  27. models/dino.py +203 -0
  28. models/image_encoder_siglipdino_shallowdeep.py +162 -0
  29. models/projectors.py +150 -0
  30. models/sigclip.py +159 -0
  31. models/text.py +113 -0
  32. models/transformer_2d_custom.py +388 -0
  33. models/unet_2d_blocks_custom.py +0 -0
  34. models/unet_2d_condition_custom.py +1059 -0
  35. models/vae.py +36 -0
  36. prompts/validation_negative.txt +1 -0
  37. requirements.txt +34 -0
  38. schedulers/__pycache__/base.cpython-310.pyc +0 -0
  39. schedulers/__pycache__/ddim.cpython-310.pyc +0 -0
  40. schedulers/__pycache__/dpm_s.cpython-310.pyc +0 -0
  41. schedulers/__pycache__/utils.cpython-310.pyc +0 -0
  42. schedulers/base.py +133 -0
  43. schedulers/ddim.py +85 -0
  44. schedulers/dpm_m.py +412 -0
  45. schedulers/dpm_s.py +243 -0
  46. schedulers/utils.py +124 -0
  47. utils.py +55 -0
configs/realcustom_sigdino_highres.json ADDED
@@ -0,0 +1,119 @@
1
+ {
2
+ "act_fn": "silu",
3
+ "addition_embed_type": "text_time",
4
+ "addition_embed_type_num_heads": 64,
5
+ "addition_time_embed_dim": 256,
6
+ "attention_head_dim": [
7
+ 5,
8
+ 10,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 2048,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "DownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D"
27
+ ],
28
+ "downsample_padding": 1,
29
+ "dual_cross_attention": false,
30
+ "encoder_hid_dim": null,
31
+ "encoder_hid_dim_type": null,
32
+ "flip_sin_to_cos": true,
33
+ "freq_shift": 0,
34
+ "in_channels": 4,
35
+ "layers_per_block": 2,
36
+ "mid_block_only_cross_attention": null,
37
+ "mid_block_scale_factor": 1,
38
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
39
+ "norm_eps": 1e-05,
40
+ "norm_num_groups": 32,
41
+ "num_attention_heads": null,
42
+ "num_class_embeds": null,
43
+ "only_cross_attention": false,
44
+ "out_channels": 4,
45
+ "projection_class_embeddings_input_dim": 2816,
46
+ "resnet_out_scale_factor": 1.0,
47
+ "resnet_skip_time_act": false,
48
+ "resnet_time_scale_shift": "default",
49
+ "sample_size": 128,
50
+ "time_cond_proj_dim": null,
51
+ "time_embedding_act_fn": null,
52
+ "time_embedding_dim": null,
53
+ "time_embedding_type": "positional",
54
+ "timestep_post_act": null,
55
+ "transformer_layers_per_block": [
56
+ 1,
57
+ 2,
58
+ 10
59
+ ],
60
+ "up_block_types": [
61
+ "CrossAttnUpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "UpBlock2D"
64
+ ],
65
+ "upcast_attention": false,
66
+ "use_linear_projection": true,
67
+ "image_ref_processor_config": {
68
+ "target": "utils.image_ref_processor.default.NaiveResizeProcessor",
69
+ "params": {
70
+ "target_image_size": 768,
71
+ "resize_mode": "resize",
72
+ "crop_min_ratio": 1.0,
73
+ "crop_max_ratio": 1.0,
74
+ "image_dropout": 0.1
75
+ }
76
+ },
77
+ "image_ref_processor_input_keys": [
78
+ "image_ref"
79
+ ],
80
+ "vision_model_config": {
81
+ "vision_model_config": {
82
+ "target": "models.image_encoder_siglipdino_shallowdeep.ShallowDeepPatchfySiglipDinoEncoder",
83
+ "params": {
84
+ "siglip_config": {
85
+ "backbone_name_or_path": "vit_so400m_patch14_siglip_384",
86
+ "image_resize_strategy": "resize-naive",
87
+ "default_image_size": 384,
88
+ "feature_index": [
89
+ 25
90
+ ]
91
+ },
92
+ "dino_config": {
93
+ "backbone_name_or_path": "vit_large_patch14_reg4_dinov2.lvd142m",
94
+ "image_resize_strategy": "resize-naive",
95
+ "default_image_size": 384,
96
+ "feature_index": [
97
+ 22
98
+ ]
99
+ },
100
+ "patchfy_scale": 2,
101
+ "default_image_size": 384
102
+ }
103
+ }
104
+ },
105
+ "image_prompt_settings": {
106
+ "vision_projection_type": "custom",
107
+ "vision_projection_config": {
108
+ "target": "models.projectors.ProjectorHighResMinAttn",
109
+ "params": {
110
+ "vision_dim": 2176,
111
+ "out_dim": 2048,
112
+ "dim_head": 64,
113
+ "adaptive_scale": false
114
+ }
115
+ },
116
+ "image_prompt_mode": "naive",
117
+ "cross_attention_id": 70
118
+ }
119
+ }
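
The inference scripts added later in this commit consume this config by popping the nested `vision_model_config` block (which is not a UNet constructor argument) before building the custom UNet, and by instantiating the vision encoder separately. A minimal sketch of that loading pattern, with checkpoint loading omitted:

```python
import json

from models.unet_2d_condition_custom import UNet2DConditionModel
from utils import instantiate_from_config

with open("configs/realcustom_sigdino_highres.json") as f:
    unet_config = json.load(f)

# The vision-encoder settings are nested one level deep and are handled
# outside the UNet, so they are popped off first (the inference scripts
# in this commit do the same).
vision_model_config = unet_config.pop("vision_model_config", None)
vision_model_config = vision_model_config.pop("vision_model_config", None)
unet_config.pop("type", None)  # not a constructor argument either

unet = UNet2DConditionModel(**unet_config).eval()
vision_model = instantiate_from_config(vision_model_config).eval()
```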
configs/realcustom_sigdino_highres_shallow.json ADDED
@@ -0,0 +1,114 @@
1
+ {
2
+ "act_fn": "silu",
3
+ "addition_embed_type": "text_time",
4
+ "addition_embed_type_num_heads": 64,
5
+ "addition_time_embed_dim": 256,
6
+ "attention_head_dim": [
7
+ 5,
8
+ 10,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 2048,
22
+ "cross_attention_norm": null,
23
+ "down_block_types": [
24
+ "DownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D"
27
+ ],
28
+ "downsample_padding": 1,
29
+ "dual_cross_attention": false,
30
+ "encoder_hid_dim": null,
31
+ "encoder_hid_dim_type": null,
32
+ "flip_sin_to_cos": true,
33
+ "freq_shift": 0,
34
+ "in_channels": 4,
35
+ "layers_per_block": 2,
36
+ "mid_block_only_cross_attention": null,
37
+ "mid_block_scale_factor": 1,
38
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
39
+ "norm_eps": 1e-05,
40
+ "norm_num_groups": 32,
41
+ "num_attention_heads": null,
42
+ "num_class_embeds": null,
43
+ "only_cross_attention": false,
44
+ "out_channels": 4,
45
+ "projection_class_embeddings_input_dim": 2816,
46
+ "resnet_out_scale_factor": 1.0,
47
+ "resnet_skip_time_act": false,
48
+ "resnet_time_scale_shift": "default",
49
+ "sample_size": 128,
50
+ "time_cond_proj_dim": null,
51
+ "time_embedding_act_fn": null,
52
+ "time_embedding_dim": null,
53
+ "time_embedding_type": "positional",
54
+ "timestep_post_act": null,
55
+ "transformer_layers_per_block": [
56
+ 1,
57
+ 2,
58
+ 10
59
+ ],
60
+ "up_block_types": [
61
+ "CrossAttnUpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "UpBlock2D"
64
+ ],
65
+ "upcast_attention": false,
66
+ "use_linear_projection": true,
67
+
68
+ "image_ref_processor_input_keys": ["image_ref"],
69
+ "vision_model_config": {
70
+ "vision_model_config": {
71
+ "target": "models.image_encoder_siglipdino_shallowdeep.ShallowDeepPatchfySiglipDinoEncoder_v2",
72
+ "params": {
73
+ "siglip_config": {
74
+ "backbone_name_or_path": "vit_so400m_patch14_siglip_384",
75
+ "image_resize_strategy": "resize-naive",
76
+ "default_image_size": 384,
77
+ "feature_index": [
78
+ 7,
79
+ 13,
80
+ 19,
81
+ 25
82
+ ]
83
+ },
84
+ "dino_config": {
85
+ "backbone_name_or_path": "vit_large_patch14_reg4_dinov2.lvd142m",
86
+ "image_resize_strategy": "resize-naive",
87
+ "default_image_size": 384,
88
+ "feature_index": [
89
+ 4,
90
+ 10,
91
+ 16,
92
+ 22
93
+ ]
94
+ },
95
+ "patchfy_scale": 2,
96
+ "default_image_size": 384
97
+ }
98
+ }
99
+ },
100
+
101
+ "image_prompt_settings": {
102
+ "vision_projection_type": "custom",
103
+ "vision_projection_config": {
104
+ "target": "models.projectors.ProjectorHighResShallowMinAttnV1",
105
+ "params": {
106
+ "vision_dim": 2176,
107
+ "out_dim": 2048,
108
+ "dim_head": 64
109
+ }
110
+ },
111
+
112
+ "image_prompt_mode": "naive"
113
+ }
114
+ }
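
Compared with `realcustom_sigdino_highres.json`, this shallow variant points `target` at `ShallowDeepPatchfySiglipDinoEncoder_v2`, taps several intermediate `feature_index` layers of each backbone instead of one, and swaps the projector for `ProjectorHighResShallowMinAttnV1`. Both files rely on the `target`/`params` convention resolved by `utils.instantiate_from_config` (utils.py is part of this commit but not shown here). A hypothetical minimal resolver for that convention, for orientation only:

```python
import importlib

def instantiate_from_config(config: dict):
    """Turn a {"target": "pkg.mod.Class", "params": {...}} block into an object.

    Hypothetical sketch of the convention these configs assume; the helper
    actually shipped in utils.py may differ in details.
    """
    module_path, class_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), class_name)
    return cls(**config.get("params", {}))
```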
inference/__pycache__/inference_utils.cpython-310.pyc ADDED
Binary file (1.58 kB).
 
inference/__pycache__/mask_generation.cpython-310.pyc ADDED
Binary file (2.58 kB).
 
inference/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (10.2 kB).
 
inference/app.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import gradio as gr
16
+
17
+ from inference.pipeline import RealCustomInferencePipeline
18
+
19
+ def create_demo():
20
+ pipeline = RealCustomInferencePipeline(
21
+ unet_config="configs/realcustom_sigdino_highres.json",
22
+ unet_checkpoint="ckpts/sdxl/unet/sdxl-unet.bin",
23
+ realcustom_checkpoint="ckpts/realcustom/RealCustom_highres.pth",
24
+ vae_config="ckpts/sdxl/vae/sdxl.json",
25
+ vae_checkpoint="ckpts/sdxl/vae/sdxl-vae.pth",
26
+ )
27
+
28
+ badges_text = r"""
29
+ <div style="text-align: center; display: flex; justify-content: left; gap: 5px;">
30
+ <a href="https://corleone-huang.github.io/RealCustom_plus_plus/"><img alt="Build" src="https://img.shields.io/badge/Project%20Page-RealCustom-yellow"></a>
31
+ <a href="https://arxiv.org/pdf/2408.09744?"><img alt="Build" src="https://img.shields.io/badge/arXiv%20paper-RealCustom-b31b1b.svg"></a>
32
+ </div>
33
+ """.strip()
34
+
35
+ with gr.Blocks() as demo:
36
+ gr.Markdown(f"# RealCustom")
37
+ gr.Markdown(badges_text)
38
+ with gr.Row():
39
+ with gr.Column():
40
+ prompt = gr.Textbox(label="Prompt", value="")
41
+ target_phrase = gr.Textbox(label="Target Phrase", value="")
42
+ with gr.Row():
43
+ image_prompt = gr.Image(label="Ref Img", visible=True, interactive=True, type="pil")
44
+
45
+ with gr.Row():
46
+ with gr.Column():
47
+ width = gr.Slider(512, 2048, 1024, step=16, label="Generation Width")
48
+ height = gr.Slider(512, 2048, 1024, step=16, label="Generation Height")
49
+
50
+ with gr.Accordion("Advanced Options", open=False):
51
+ with gr.Row():
52
+ guidance = gr.Slider(1.0, 15, 3.5, step=0.5, label="Guidance Scale", interactive=True)
53
+ mask_scope = gr.Slider(0.05, 1.0, 0.2, step=0.05, label="Mask Scope", interactive=True)
54
+ seed = gr.Number(0, label="Seed (-1 for random)")
55
+ num = gr.Number(4, label="Generation Number")
56
+ new_unet_local_path = gr.Textbox(label="New Unet Local Path", value="")
57
+ new_realcustom_local_path = gr.Textbox(label="New RealCustom Local Path", value="")
58
+
59
+ generate_btn = gr.Button("Generate")
60
+
61
+ with gr.Column():
62
+ output_image = gr.Image(label="Generated Image")
63
+ output_mask = gr.Image(label="Guidance Mask")
64
+
65
+ inputs = [
66
+ prompt, image_prompt, target_phrase,
67
+ height, width, guidance, seed, num,
68
+ mask_scope,
69
+ new_unet_local_path, new_realcustom_local_path,
70
+ ]
71
+ generate_btn.click(
72
+ fn=pipeline.generation,
73
+ inputs=inputs,
74
+ outputs=[output_image, output_mask],
75
+ )
76
+
77
+ return demo
78
+
79
+
80
+ if __name__ == "__main__":
81
+ demo = create_demo()
82
+ demo.launch(server_name='0.0.0.0', server_port=7860)
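
The `inputs` list above is wired positionally to `RealCustomInferencePipeline.generation` (defined in `inference/pipeline.py` below), so the pipeline can also be driven without Gradio. A sketch, assuming the checkpoints referenced above have been downloaded and using the reference-image path from the shell script below as an illustrative input (the image itself is not part of this commit):

```python
from PIL import Image

from inference.pipeline import RealCustomInferencePipeline

pipeline = RealCustomInferencePipeline(
    unet_config="configs/realcustom_sigdino_highres.json",
    unet_checkpoint="ckpts/sdxl/unet/sdxl-unet.bin",
    realcustom_checkpoint="ckpts/realcustom/RealCustom_highres.pth",
    vae_config="ckpts/sdxl/vae/sdxl.json",
    vae_checkpoint="ckpts/sdxl/vae/sdxl-vae.pth",
)

ref_image = Image.open("prompts/figurine.png").convert("RGB")  # illustrative path
grid, mask_grid = pipeline.generation(
    "the figurine is flying in the sky",  # prompt
    ref_image,                            # reference image (PIL)
    "figurine",                           # target phrase
    height=1024, width=1024,
    guidance_scale=3.5, seed=0, samples_per_prompt=4,
    mask_scope=0.2,
)
grid.save("output_grid.jpg")
mask_grid.save("output_mask_grid.jpg")
```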
inference/inference_single_image.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import torch
17
+ import json
18
+ import os
19
+ import torchvision
20
+ from torchvision.utils import make_grid
21
+ from torchvision.transforms.functional import to_pil_image
22
+ from tqdm import tqdm
23
+ from PIL import Image
24
+
25
+ from models.text import TextModel
26
+ from models.vae import AutoencoderKL
27
+
28
+ from models.unet_2d_condition_custom import UNet2DConditionModel as UNet2DConditionModelDiffusers
29
+ from schedulers.ddim import DDIMScheduler
30
+ from schedulers.dpm_s import DPMSolverSingleStepScheduler
31
+ from schedulers.utils import get_betas
32
+
33
+ from inference_utils import find_phrase_positions_in_text, classifier_free_guidance_image_prompt_cascade
34
+ from mask_generation import mask_generation
35
+ from utils import instantiate_from_config
36
+
37
+ # Argument parser
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument("--width", type=int, default=512)
40
+ parser.add_argument("--height", type=int, default=512)
41
+
42
+ parser.add_argument("--samples_per_prompt", type=int, required=True)
43
+ parser.add_argument("--nrow", type=int, default=4)
44
+ parser.add_argument("--sample_steps", type=int, required=True)
45
+ parser.add_argument("--schedule_type", type=str, default="squared_linear") # default, `squared_linear
46
+ parser.add_argument("--scheduler_type", type=str, default="dpm", choices=["ddim", "dpm"]) # default, "dpm"
47
+ parser.add_argument("--schedule_shift_snr", type=float, default=1) # default, 1
48
+
49
+ parser.add_argument("--text_encoder_variant", type=str, nargs="+")
50
+ parser.add_argument("--vae_config", type=str, default="configs/vae.json") # default
51
+ parser.add_argument("--vae_checkpoint", type=str, required=True)
52
+ parser.add_argument("--unet_config", type=str, required=True)
53
+ parser.add_argument("--unet_checkpoint", type=str, required=True)
54
+ parser.add_argument("--unet_checkpoint_base_model", type=str, default="")
55
+ parser.add_argument("--unet_prediction", type=str, choices=DDIMScheduler.prediction_types, default="epsilon") # default, "epsilon"
56
+
57
+ parser.add_argument("--negative_prompt", type=str, default="prompts/validation_negative.txt") # default
58
+
59
+ parser.add_argument("--compile", action="store_true", default=False)
60
+ parser.add_argument("--output_dir", type=str, required=True)
61
+
62
+ parser.add_argument("--guidance_weight", type=float, default=7.5)
63
+ parser.add_argument("--seed", type=int, default=666)
64
+ parser.add_argument("--device", type=str, default="cuda")
65
+
66
+ parser.add_argument("--text_prompt", type=str, required=True)
67
+ parser.add_argument("--image_prompt_path", type=str, required=True)
68
+ parser.add_argument("--target_phrase", type=str, required=True)
69
+ parser.add_argument("--mask_scope", type=float, default=0.20)
70
+ parser.add_argument("--mask_strategy", type=str, nargs="+", default=["max_norm"])
71
+ parser.add_argument("--mask_reused_step", type=int, default=12)
72
+
73
+ args = parser.parse_args()
74
+
75
+ # Initialize unet model
76
+ with open(args.unet_config) as unet_config_file:
77
+ unet_config = json.load(unet_config_file)
78
+
79
+ # Settings for image encoder
80
+ vision_model_config = unet_config.pop("vision_model_config", None)
81
+ args.vision_model_config = vision_model_config.pop("vision_model_config", None)
82
+
83
+ unet_type = unet_config.pop("type", None)
84
+ unet_model = UNet2DConditionModelDiffusers(**unet_config)
85
+
86
+ unet_model.eval().to(args.device)
87
+ unet_model.load_state_dict(torch.load(args.unet_checkpoint, map_location=args.device), strict=False)
88
+ print("loading unet model finished.")
89
+
90
+ if args.unet_checkpoint_base_model != "":
91
+ if "safetensors" in args.unet_checkpoint_base_model:
92
+ from safetensors import safe_open
93
+ tensors = {}
94
+ with safe_open(args.unet_checkpoint_base_model, framework="pt", device='cpu') as f:
95
+ for k in f.keys():
96
+ new_k = k.replace("model.diffusion_model.", "")
97
+ tensors[k] = f.get_tensor(k)
98
+ unet_model.load_state_dict(tensors, strict=False)
99
+ else:
100
+ unet_model.load_state_dict(torch.load(args.unet_checkpoint_base_model, map_location=args.device), strict=False)
101
+ unet_model = torch.compile(unet_model, disable=not args.compile)
102
+ print("loading unet base model finished.")
103
+
104
+ # Initialize vae model
105
+ with open(args.vae_config) as vae_config_file:
106
+ vae_config = json.load(vae_config_file)
107
+ vae_downsample_factor = 2 ** (len(vae_config["block_out_channels"]) - 1) # 2 ** 3 = 8
108
+ vae_model = AutoencoderKL(**vae_config)
109
+ vae_model.eval().to(args.device)
110
+ vae_model.load_state_dict(torch.load(args.vae_checkpoint, map_location=args.device))
111
+ vae_decoder = torch.compile(lambda x: vae_model.decode(x / vae_model.scaling_factor).sample.clip(-1, 1), disable=not args.compile)
112
+ vae_encoder = torch.compile(lambda x: vae_model.encode(x).latent_dist.mode().mul_(vae_model.scaling_factor), disable=not args.compile)
113
+ print("loading vae finished.")
114
+
115
+ # Initialize ddim scheduler
116
+ ddim_train_steps = 1000
117
+ ddim_betas = get_betas(name=args.schedule_type, num_steps=ddim_train_steps, shift_snr=args.schedule_shift_snr, terminal_pure_noise=False)
118
+ scheduler_class = DPMSolverSingleStepScheduler if args.scheduler_type == 'dpm' else DDIMScheduler
119
+ scheduler = scheduler_class(betas=ddim_betas, num_train_timesteps=ddim_train_steps, num_inference_timesteps=args.sample_steps, device=args.device)
120
+ infer_timesteps = scheduler.timesteps
121
+
122
+ # Initialize text model
123
+ text_model = TextModel(args.text_encoder_variant, ["penultimate_nonorm"])
124
+ text_model.eval().to(args.device)
125
+ print("loading text model finished.")
126
+
127
+ # Initialize image model.
128
+ vision_model = instantiate_from_config(args.vision_model_config)
129
+ vision_model = vision_model.eval().to(args.device)
130
+ print("loading image model finished.")
131
+
132
+ negative_prompt = ""
133
+ if args.negative_prompt:
134
+ with open(args.negative_prompt) as f:
135
+ negative_prompt = f.read().strip()
136
+
137
+ image_metadata_validate = torch.tensor(
138
+ data=[
139
+ args.width, # original_height
140
+ args.height, # original_width
141
+ 0, # coordinate top
142
+ 0, # coordinate left
143
+ args.width, # target_height
144
+ args.height, # target_width
145
+ ],
146
+ device=args.device,
147
+ dtype=torch.float32
148
+ ).view(1, -1).repeat(args.samples_per_prompt, 1)
149
+
150
+ # Create output directory
151
+ os.makedirs(args.output_dir, exist_ok=True)
152
+ args.output_image_grid_dir = os.path.join(args.output_dir, "images_grid")
153
+ args.output_image_dir = os.path.join(args.output_dir, "images")
154
+ args.output_mask_grid_dir = os.path.join(args.output_dir, "masks_grid")
155
+ args.output_mask_dir = os.path.join(args.output_dir, "masks")
156
+ os.makedirs(args.output_image_grid_dir, exist_ok=True)
157
+ os.makedirs(args.output_image_dir, exist_ok=True)
158
+ os.makedirs(args.output_mask_grid_dir, exist_ok=True)
159
+ os.makedirs(args.output_mask_dir, exist_ok=True)
160
+
161
+ with torch.no_grad():
162
+ # Prepare negative prompt.
163
+ if args.guidance_weight != 1:
164
+ text_negative_output = text_model(negative_prompt)
165
+
166
+ positive_prompt = args.text_prompt
167
+ positive_promt_image_path = args.image_prompt_path
168
+ target_phrase = args.target_phrase
169
+
170
+ # Compute target phrases
171
+ target_token = torch.zeros(1, 77).to(args.device)
172
+ positions = find_phrase_positions_in_text(positive_prompt, target_phrase)
173
+ for position in positions:
174
+ prompt_before = positive_prompt[:position] # NOTE We do not need -1 here because the SDXL text encoder does not encode the trailing space.
175
+ prompt_include = positive_prompt[:position+len(target_phrase)]
176
+ print("prompt before: ", prompt_before, ", prompt_include: ", prompt_include)
177
+ prompt_before_length = text_model.get_vaild_token_length(prompt_before) + 1
178
+ prompt_include_length = text_model.get_vaild_token_length(prompt_include) + 1
179
+ print("prompt_before_length: ", prompt_before_length, ", prompt_include_length: ", prompt_include_length)
180
+ target_token[:, prompt_before_length:prompt_include_length] = 1
181
+
182
+ # Text used for progress bar
183
+ pbar_text = positive_prompt[:40]
184
+
185
+ # Compute text embeddings
186
+ text_positive_output = text_model(positive_prompt)
187
+ text_positive_embeddings = text_positive_output.embeddings.repeat_interleave(args.samples_per_prompt, dim=0)
188
+ text_positive_pooled = text_positive_output.pooled[-1].repeat_interleave(args.samples_per_prompt, dim=0)
189
+ if args.guidance_weight != 1:
190
+ text_negative_embeddings = text_negative_output.embeddings.repeat_interleave(args.samples_per_prompt, dim=0)
191
+ text_negative_pooled = text_negative_output.pooled[-1].repeat_interleave(args.samples_per_prompt, dim=0)
192
+
193
+ # Compute image embeddings
194
+ positive_image = Image.open(positive_promt_image_path).convert("RGB")
195
+ positive_image = torchvision.transforms.ToTensor()(positive_image)
196
+
197
+ positive_image = positive_image.unsqueeze(0).repeat_interleave(args.samples_per_prompt, dim=0)
198
+ positive_image = torch.nn.functional.interpolate(
199
+ positive_image,
200
+ size=(768, 768),
201
+ mode="bilinear",
202
+ align_corners=False
203
+ )
204
+ negative_image = torch.zeros_like(positive_image)
205
+ print(positive_image.size(), negative_image.size())
206
+ positive_image = positive_image.to(args.device)
207
+ negative_image = negative_image.to(args.device)
208
+
209
+ positive_image_dict = {"image_ref": positive_image}
210
+ positive_image_output = vision_model(positive_image_dict, device=args.device)
211
+
212
+ negative_image_dict = {"image_ref": negative_image}
213
+ negative_image_output = vision_model(negative_image_dict, device=args.device)
214
+
215
+ # Initialize latent with input latent + noise (i2i) / pure noise (t2i)
216
+ latent = torch.randn(
217
+ size=[
218
+ args.samples_per_prompt,
219
+ vae_config["latent_channels"],
220
+ args.height // vae_downsample_factor,
221
+ args.width // vae_downsample_factor
222
+ ],
223
+ device=args.device,
224
+ generator=torch.Generator(args.device).manual_seed(args.seed))
225
+ target_h = (args.height // vae_downsample_factor) // 2
226
+ target_w = (args.width // vae_downsample_factor) // 2
227
+
228
+ # Real Reverse diffusion process.
229
+ text2image_crossmap_2d_all_timesteps_list = []
230
+ current_step = 0
231
+ for timestep in tqdm(iterable=infer_timesteps, desc=f"[{pbar_text}]", dynamic_ncols=True):
232
+ if current_step < args.mask_reused_step:
233
+ pred_cond, pred_cond_dict = unet_model(
234
+ sample=latent,
235
+ timestep=timestep,
236
+ encoder_hidden_states=text_positive_embeddings,
237
+ encoder_attention_mask=None,
238
+ added_cond_kwargs=dict(
239
+ text_embeds=text_positive_pooled,
240
+ time_ids=image_metadata_validate
241
+ ),
242
+ vision_input_dict=None,
243
+ vision_guided_mask=None,
244
+ return_as_origin=False,
245
+ return_text2image_mask=True,
246
+ )
247
+ crossmap_2d_avg = mask_generation(
248
+ crossmap_2d_list=pred_cond_dict["text2image_crossmap_2d"], selfmap_2d_list=pred_cond_dict.get("self_attention_map", []),
249
+ target_token=target_token, mask_scope=args.mask_scope,
250
+ mask_target_h=target_h, mask_target_w=target_w, mask_mode=args.mask_strategy,
251
+ )
252
+ else:
253
+ # using previous step's mask
254
+ crossmap_2d_avg = text2image_crossmap_2d_all_timesteps_list[-1].squeeze(1)
255
+ if crossmap_2d_avg.dim() == 5: # Means that each layer uses a separate mask weight.
256
+ text2image_crossmap_2d_all_timesteps_list.append(crossmap_2d_avg.mean(dim=2).unsqueeze(1))
257
+ else:
258
+ text2image_crossmap_2d_all_timesteps_list.append(crossmap_2d_avg.unsqueeze(1))
259
+
260
+ pred_cond, pred_cond_dict = unet_model(
261
+ sample=latent,
262
+ timestep=timestep,
263
+ encoder_hidden_states=text_positive_embeddings,
264
+ encoder_attention_mask=None,
265
+ added_cond_kwargs=dict(
266
+ text_embeds=text_positive_pooled,
267
+ time_ids=image_metadata_validate
268
+ ),
269
+ vision_input_dict=positive_image_output,
270
+ vision_guided_mask=crossmap_2d_avg,
271
+ return_as_origin=False,
272
+ return_text2image_mask=True,
273
+ multiple_reference_image=False
274
+ )
275
+
276
+ crossmap_2d_avg_neg = crossmap_2d_avg.mean(dim=1, keepdim=True)
277
+ pred_negative, pred_negative_dict = unet_model(
278
+ sample=latent,
279
+ timestep=timestep,
280
+ encoder_hidden_states=text_negative_embeddings,
281
+ encoder_attention_mask=None,
282
+ added_cond_kwargs=dict(
283
+ text_embeds=text_negative_pooled,
284
+ time_ids=image_metadata_validate
285
+ ),
286
+ vision_input_dict=negative_image_output,
287
+ vision_guided_mask=crossmap_2d_avg,
288
+ return_as_origin=False,
289
+ return_text2image_mask=True,
290
+ multiple_reference_image=False
291
+ )
292
+
293
+ pred = classifier_free_guidance_image_prompt_cascade(
294
+ pred_t_cond=None, pred_ti_cond=pred_cond, pred_uncond=pred_negative,
295
+ guidance_weight_t=args.guidance_weight, guidance_weight_i=args.guidance_weight,
296
+ guidance_stdev_rescale_factor=0, cfg_rescale_mode="naive_global_direct"
297
+ )
298
+ step = scheduler.step(
299
+ model_output=pred,
300
+ model_output_type=args.unet_prediction,
301
+ timestep=timestep,
302
+ sample=latent)
303
+
304
+ latent = step.prev_sample
305
+
306
+ current_step += 1
307
+
308
+ sample = vae_decoder(step.pred_original_sample)
309
+
310
+ # save each image
311
+ for sample_i in range(sample.size(0)):
312
+ sample_i_image = torch.clamp(sample[sample_i] * 0.5 + 0.5, min=0, max=1).float()
313
+ to_pil_image(sample_i_image).save(args.output_image_dir + "/output_{}.jpg".format(sample_i))
314
+
315
+ # save grid images
316
+ sample = make_grid(sample, normalize=True, value_range=(-1, 1), nrow=args.nrow).float()
317
+ to_pil_image(sample).save(args.output_image_grid_dir + "/grid_image.jpg")
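
The reference-image branch above expects a `[0, 1]` tensor resized to 768×768 before it reaches the vision encoder, with an all-zeros image standing in for the unconditional branch. That preprocessing in isolation (the image path is illustrative, taken from the shell script below):

```python
import torch
import torchvision
from PIL import Image

# Reference-image preprocessing as performed above: RGB PIL image -> [0, 1]
# tensor, batched and bilinearly resized to the 768x768 input the encoder expects.
image = Image.open("prompts/figurine.png").convert("RGB")
image = torchvision.transforms.ToTensor()(image).unsqueeze(0)
image = torch.nn.functional.interpolate(image, size=(768, 768), mode="bilinear", align_corners=False)
negative_image = torch.zeros_like(image)  # "empty" reference for the unconditional pass
print(image.shape, negative_image.shape)  # torch.Size([1, 3, 768, 768]) for both
```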
inference/inference_single_image.sh ADDED
@@ -0,0 +1,55 @@
1
+ #!/bin/bash
2
+ # ----------------------------------------------------------------------------------------------------
3
+ HEIGHT="1024" # Base height.
4
+ WIDTH="1024" # Base width.
5
+ SAMPLES_PER_PROMPT="4" # Num of samples to generate per prompt.
6
+ NROW="2" # Grid images per row.
7
+
8
+ OUTPUT_DIR="outputs/test"
9
+
10
+ # ----------------------------------------------------------------------------------------------------
11
+ MASK_TYPE=("max_norm")
12
+ # usually:"max_norm" "crossmap_32" "selfmap_min_max_per_channel" "selfmap_64"
13
+ # [
14
+ # "max_norm", "min_max_norm", "binary", "min_max_per_channel", "decoder_map"
15
+ # "selfmap", "selfmap_min_max_per_channel" "selfmap_64"
16
+
17
+ # ]
18
+
19
+ CFG=7.5
20
+ STEPS=25
21
+ mask_reused_step=12
22
+
23
+ UNET_CONFIG="configs/realcustom_sigdino_highres.json"
24
+ UNET_CHECKPOINT="ckpts/realcustom/RealCustom_0025000_ema_highres.pth"
25
+ UNET_CHECKPOINT_BASE_MODEL="ckpts/sdxl/unet/general_v1-3_sdxl_03.pth"
26
+ # ----------------------------------------------------------------------------------------------------
27
+ CLIP1_DIR="ckpts/sdxl/clip-sdxl-1"
28
+ CLIP2_DIR="ckpts/sdxl/clip-sdxl-2"
29
+ VAE_CONFIG_PATH="ckpts/sdxl/vae/sdxl.json"
30
+ VAE_CHECKPOINT_PATH="ckpts/sdxl/vae/sdxl-vae.pth"
31
+
32
+
33
+ echo "Start inference"
34
+ python3 inference/inference_single_image.py \
35
+ --width $WIDTH \
36
+ --height $HEIGHT \
37
+ --samples_per_prompt $SAMPLES_PER_PROMPT \
38
+ --nrow $NROW \
39
+ --sample_steps $STEPS \
40
+ --guidance_weight $CFG \
41
+ --text_encoder_variant \
42
+ $CLIP1_DIR \
43
+ $CLIP2_DIR \
44
+ --unet_config $UNET_CONFIG \
45
+ --unet_checkpoint $UNET_CHECKPOINT \
46
+ --unet_checkpoint_base_model $UNET_CHECKPOINT_BASE_MODEL \
47
+ --vae_config $VAE_CONFIG_PATH \
48
+ --vae_checkpoint $VAE_CHECKPOINT_PATH \
49
+ --output_dir $OUTPUT_DIR \
50
+ --seed 2024 \
51
+ --text_prompt "the figurine is flying in the sky" \
52
+ --image_prompt_path "prompts/figurine.png" \
53
+ --target_phrase "figurine" \
54
+ --mask_scope 0.25 \
55
+ --mask_strategy ${MASK_TYPE[*]}
inference/inference_utils.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ def find_phrase_positions_in_text(text, phrase):
16
+ """
17
+ Return the starting character positions of every occurrence of the phrase in the text.
18
+ """
19
+
20
+ position = -1
21
+ positions = []
22
+ while True:
23
+ position = text.find(phrase, position + 1)
24
+ if position == -1:
25
+ break
26
+ positions.append(position)
27
+ return positions
28
+
29
+ def classifier_free_guidance_image_prompt_cascade(
30
+ pred_t_cond, pred_ti_cond, pred_uncond, guidance_weight_t=7.5, guidance_weight_i=7.5,
31
+ guidance_stdev_rescale_factor=0.7, cfg_rescale_mode="none", super_cross_mask=None
32
+ ):
33
+
34
+ if cfg_rescale_mode == "none":
35
+ pred = pred_uncond + guidance_weight_t * (pred_t_cond - pred_uncond) + guidance_weight_i * (pred_ti_cond - pred_t_cond)
36
+ elif cfg_rescale_mode == "none_direct":
37
+ pred = pred_uncond + guidance_weight_i * (pred_ti_cond - pred_uncond)
38
+ elif cfg_rescale_mode == "naive":
39
+ assert super_cross_mask is not None
40
+ pred_std_t_before = pred_t_cond.std([1, 2, 3], keepdim=True)
41
+ pred_std_ti_before = pred_ti_cond.std([1, 2, 3], keepdim=True)
42
+
43
+ pred = pred_uncond + guidance_weight_t * (pred_t_cond - pred_uncond) + guidance_weight_i * (pred_ti_cond - pred_t_cond)
44
+
45
+ pred_std_after = pred.std([1, 2, 3], keepdim=True)
46
+
47
+ pred_rescale_t_factor = guidance_stdev_rescale_factor * (pred_std_t_before / pred_std_after) + (1 - guidance_stdev_rescale_factor)
48
+ pred_rescale_ti_factor = guidance_stdev_rescale_factor * (pred_std_ti_before / pred_std_after) + (1 - guidance_stdev_rescale_factor)
49
+
50
+ pred_ti = pred * super_cross_mask
51
+ pred_t = pred * (1 - super_cross_mask)
52
+ pred = pred_ti * pred_rescale_ti_factor + pred_t * pred_rescale_t_factor
53
+ elif cfg_rescale_mode == "naive_global":
54
+ pred_std_ti_before = pred_ti_cond.std([1, 2, 3], keepdim=True)
55
+
56
+ pred = pred_uncond + guidance_weight_t * (pred_t_cond - pred_uncond) + guidance_weight_i * (pred_ti_cond - pred_t_cond)
57
+
58
+ pred_std_after = pred.std([1, 2, 3], keepdim=True)
59
+
60
+ pred_rescale_ti_factor = guidance_stdev_rescale_factor * (pred_std_ti_before / pred_std_after) + (1 - guidance_stdev_rescale_factor)
61
+
62
+ pred = pred * pred_rescale_ti_factor
63
+ elif cfg_rescale_mode == "naive_global_direct":
64
+ pred_std_ti_before = pred_ti_cond.std([1, 2, 3], keepdim=True)
65
+
66
+ pred = pred_uncond + guidance_weight_i * (pred_ti_cond - pred_uncond)
67
+
68
+ pred_std_after = pred.std([1, 2, 3], keepdim=True)
69
+
70
+ pred_rescale_ti_factor = guidance_stdev_rescale_factor * (pred_std_ti_before / pred_std_after) + (1 - guidance_stdev_rescale_factor)
71
+
72
+ pred = pred * pred_rescale_ti_factor
73
+ else:
74
+ raise NotImplementedError()
75
+
76
+ return pred
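
Both helpers are pure string/tensor utilities, so they can be exercised in isolation. A short sanity check (import path assumes the repository root is on `PYTHONPATH`); in `"none"` mode the cascade reduces to `uncond + w_t * (t_cond - uncond) + w_i * (ti_cond - t_cond)`:

```python
import torch

from inference.inference_utils import (
    find_phrase_positions_in_text,
    classifier_free_guidance_image_prompt_cascade,
)

# Every character offset at which the phrase occurs in the prompt.
print(find_phrase_positions_in_text("the figurine is flying in the sky", "figurine"))  # [4]

# Dummy predictions: uncond = 0, text-cond = 0.5, text+image-cond = 1.0.
pred_uncond = torch.zeros(1, 4, 8, 8)
pred_t_cond = torch.full((1, 4, 8, 8), 0.5)
pred_ti_cond = torch.ones(1, 4, 8, 8)

pred = classifier_free_guidance_image_prompt_cascade(
    pred_t_cond, pred_ti_cond, pred_uncond,
    guidance_weight_t=7.5, guidance_weight_i=7.5,
    cfg_rescale_mode="none",
)
print(pred.mean().item())  # 0 + 7.5 * 0.5 + 7.5 * 0.5 = 7.5
```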
inference/mask_generation.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from einops import rearrange
18
+
19
+ def mask_generation(
20
+ crossmap_2d_list, selfmap_2d_list=None,
21
+ target_token=None, mask_scope=None,
22
+ mask_target_h=64, mask_target_w=64,
23
+ mask_mode=["binary"],
24
+ ):
25
+ if len(selfmap_2d_list) > 0:
26
+ target_hw_selfmap = mask_target_h * mask_target_w
27
+ selfmap_2ds = []
28
+ for i in range(len(selfmap_2d_list)):
29
+ selfmap_ = selfmap_2d_list[i]
30
+ selfmap_ = F.interpolate(selfmap_, size=(target_hw_selfmap, target_hw_selfmap), mode='bilinear')
31
+ selfmap_2ds.append(selfmap_ )
32
+ selfmap_2ds = torch.cat(selfmap_2ds, dim=1)
33
+ if "selfmap_min_max_per_channel" in mask_mode:
34
+ selfmap_1ds = rearrange(selfmap_2ds, "b c h w -> b c (h w)")
35
+ channel_max_self = torch.max(selfmap_1ds, dim=-1, keepdim=True)[0].unsqueeze(-1)
36
+ channel_min_self = torch.min(selfmap_1ds, dim=-1, keepdim=True)[0].unsqueeze(-1)
37
+ selfmap_2ds = (selfmap_2ds - channel_min_self) / (channel_max_self - channel_min_self + 1e-6)
38
+ elif "selfmap_max_norm" in mask_mode:
39
+ selfmap_1ds = rearrange(selfmap_2ds, "b c h w -> b c (h w)")
40
+ b = selfmap_1ds.size(0)
41
+ batch_max = torch.max(selfmap_1ds.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(-1).unsqueeze(-1)
42
+ selfmap_2ds = selfmap_2ds / (batch_max + 1e-10)
43
+
44
+ selfmap_2d = selfmap_2ds.mean(dim=1, keepdim=True)
45
+ else:
46
+ selfmap_2d = None
47
+
48
+ crossmap_2ds = []
49
+ for i in range(len(crossmap_2d_list)):
50
+ crossmap = crossmap_2d_list[i]
51
+ crossmap = crossmap.mean(dim=1) # average on head dim
52
+ crossmap = crossmap * target_token.unsqueeze(-1).unsqueeze(-1) # target token valid
53
+ crossmap = crossmap.sum(dim=1, keepdim=True)
54
+
55
+ crossmap = F.interpolate(crossmap, size=(mask_target_h, mask_target_w), mode='bilinear')
56
+ crossmap_2ds.append(crossmap)
57
+ crossmap_2ds = torch.cat(crossmap_2ds, dim=1)
58
+ crossmap_1ds = rearrange(crossmap_2ds, "b c h w -> b c (h w)")
59
+
60
+ if "max_norm" in mask_mode:
61
+ crossmap_1d_avg = torch.mean(crossmap_1ds, dim=1, keepdim=True) # [b, 1, (h w)]
62
+ if selfmap_2d is not None:
63
+ crossmap_1d_avg = torch.matmul(selfmap_2d, crossmap_1d_avg.unsqueeze(-1)).squeeze(-1)
64
+ b, c, n = crossmap_1ds.shape
65
+ batch_max = torch.max(crossmap_1d_avg.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(1)
66
+ crossmap_1d_avg = crossmap_1d_avg / (batch_max + 1e-6)
67
+ elif "min_max_norm" in mask_mode:
68
+ crossmap_1d_avg = torch.mean(crossmap_1ds, dim=1, keepdim=True) # [b, 1, (h w)]
69
+ if selfmap_2d is not None:
70
+ crossmap_1d_avg = torch.matmul(selfmap_2d, crossmap_1d_avg.unsqueeze(-1)).squeeze(-1)
71
+ b, c, n = crossmap_1ds.shape
72
+ batch_max = torch.max(crossmap_1d_avg.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(1) # NOTE unsqueeze
73
+ batch_min = torch.min(crossmap_1d_avg.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(1) # NOTE unsqueeze
74
+ crossmap_1d_avg = (crossmap_1d_avg - batch_min) / (batch_max - batch_min + 1e-6)
75
+ elif "min_max_per_channel" in mask_mode:
76
+ channel_max = torch.max(crossmap_1ds, dim=-1, keepdim=True)[0]
77
+ channel_min = torch.min(crossmap_1ds, dim=-1, keepdim=True)[0]
78
+ crossmap_1ds = (crossmap_1ds - channel_min) / (channel_max - channel_min + 1e-6)
79
+ crossmap_1d_avg = torch.mean(crossmap_1ds, dim=1, keepdim=True) # [b, 1, (h w)]
80
+ if selfmap_2d is not None:
81
+ crossmap_1d_avg = torch.matmul(selfmap_2d, crossmap_1d_avg.unsqueeze(-1)).squeeze(-1)
82
+
83
+ # renormalize to 0-1
84
+ b, c, n = crossmap_1d_avg.shape
85
+ batch_max = torch.max(crossmap_1d_avg.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(1)
86
+ batch_min = torch.min(crossmap_1d_avg.view(b, -1), dim=-1, keepdim=True)[0].unsqueeze(1)
87
+ crossmap_1d_avg = (crossmap_1d_avg - batch_min) / (batch_max - batch_min + 1e-6)
88
+ else:
89
+ crossmap_1d_avg = torch.mean(crossmap_1ds, dim=1, keepdim=True) # [b, 1, (h w)]
90
+
91
+
92
+ if "threshold" in mask_mode:
93
+ threshold = 1 - mask_scope
94
+ crossmap_1d_avg[crossmap_1d_avg < threshold] = 0.0
95
+ if "binary" in mask_mode:
96
+ crossmap_1d_avg[crossmap_1d_avg > threshold] = 1.0
97
+ else:
98
+ # topk
99
+ topk_num = int(crossmap_1d_avg.size(-1) * mask_scope)
100
+ sort_score, sort_order = crossmap_1d_avg.sort(descending=True, dim=-1)
101
+ sort_topk = sort_order[:, :, :topk_num]
102
+ sort_topk_remain = sort_order[:, :, topk_num:]
103
+ crossmap_1d_avg = crossmap_1d_avg.scatter(2, sort_topk_remain, 0.)
104
+ if "binary" in mask_mode:
105
+ crossmap_1d_avg = crossmap_1d_avg.scatter(2, sort_topk, 1.0)
106
+
107
+ crossmap_2d_avg = rearrange(crossmap_1d_avg, "b c (h w) -> b c h w", h=mask_target_h, w=mask_target_w)
108
+ crossmap_2d_avg = crossmap_2d_avg
109
+
110
+ output = crossmap_2d_avg.unsqueeze(1) # torch.Size([4, 1, 60, 64, 64]), The second dimension is the dimension of the number of reference images.
111
+ if output.size(2) == 1: # The dimension of the layer.
112
+ output = output.squeeze(2) # If there is only a single dimension, then all layers will share the same mask.
113
+
114
+ return output
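
`mask_generation` only needs raw cross-attention maps and a token mask, so its output shape can be checked with random tensors. A minimal sketch (repository root assumed on `PYTHONPATH`); with `mask_scope=0.25` and the `"max_norm"` strategy, roughly a quarter of the spatial positions keep a non-zero weight:

```python
import torch

from inference.mask_generation import mask_generation

batch, heads, tokens = 2, 8, 77
# Dummy text-to-image cross-attention maps from two layers at 32x32 and 16x16.
crossmaps = [
    torch.rand(batch, heads, tokens, 32, 32),
    torch.rand(batch, heads, tokens, 16, 16),
]
target_token = torch.zeros(batch, tokens)
target_token[:, 5:7] = 1  # pretend tokens 5-6 belong to the target phrase

mask = mask_generation(
    crossmap_2d_list=crossmaps, selfmap_2d_list=[],
    target_token=target_token, mask_scope=0.25,
    mask_target_h=64, mask_target_w=64, mask_mode=["max_norm"],
)
print(mask.shape)                 # torch.Size([2, 1, 64, 64])
print((mask > 0).float().mean())  # ~0.25 of positions survive the top-k cut
```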
inference/pipeline.py ADDED
@@ -0,0 +1,359 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import json
17
+ import torch
18
+ import torchvision
19
+ from torchvision.utils import make_grid
20
+ from torchvision.transforms.functional import to_pil_image
21
+ from PIL import Image
22
+
23
+ from models.text import TextModel
24
+ from models.vae import AutoencoderKL
25
+ from models.unet_2d_condition_custom import UNet2DConditionModel as UNet2DConditionModelDiffusers
26
+
27
+ from schedulers.ddim import DDIMScheduler
28
+ from schedulers.dpm_s import DPMSolverSingleStepScheduler
29
+ from schedulers.utils import get_betas
30
+
31
+ from inference_utils import find_phrase_positions_in_text, classifier_free_guidance_image_prompt_cascade
32
+ from mask_generation import mask_generation
33
+ from utils import instantiate_from_config
34
+
35
+ from tqdm import tqdm
36
+ from einops import rearrange
37
+
38
+ class RealCustomInferencePipeline:
39
+ def __init__(
40
+ self,
41
+ unet_config,
42
+ unet_checkpoint,
43
+ realcustom_checkpoint,
44
+ vae_config="ckpts/sdxl/vae/sdxl.json",
45
+ vae_checkpoint="ckpts/sdxl/vae/sdxl-vae.pth",
46
+ model_type="bf16",
47
+ device="cuda",
48
+ ):
49
+ if model_type == "bf16":
50
+ self.torch_dtype = torch.bfloat16
51
+ else:
52
+ self.torch_dtype = torch.float32
53
+
54
+ if not os.path.exists("ckpts/"):
55
+ from huggingface_hub import snapshot_download
56
+ print("Downloading RealCustom ...")
57
+ snapshot_download(
58
+ repo_id="bytedance-research/RealCustom",
59
+ repo_type="model",
60
+ local_dir="ckpts", # 指定本地目录
61
+ allow_patterns="ckpts/**", # 只下载 ckpts 文件夹内容
62
+ local_dir_use_symlinks=False # 直接存储文件而非符号链接
63
+ )
64
+
65
+ self.device = device
66
+ self.unet_checkpoint = unet_checkpoint
67
+ self.realcustom_checkpoint = realcustom_checkpoint
68
+ self._load_unet_checkpoint(unet_config, unet_checkpoint, realcustom_checkpoint)
69
+ self._load_vae_checkpoint(vae_config, vae_checkpoint)
70
+ self._load_encoder_checkpoint()
71
+ self._init_scheduler()
72
+ self._load_negative_prompt()
73
+
74
+
75
+ def _load_unet_checkpoint(self, unet_config, unet_checkpoint, realcustom_checkpoint):
76
+ # Initialize unet model
77
+ with open(unet_config) as unet_config_file:
78
+ unet_config = json.load(unet_config_file)
79
+ self.unet_prediction = "epsilon"
80
+
81
+ # Settings for image encoder
82
+ vision_model_config = unet_config.pop("vision_model_config", None)
83
+ self.vision_model_config = vision_model_config.pop("vision_model_config", None)
84
+
85
+ self.unet_model = UNet2DConditionModelDiffusers(**unet_config)
86
+
87
+ self.unet_model.eval().to(self.device).to(self.torch_dtype)
88
+ self.unet_model.load_state_dict(torch.load(unet_checkpoint, map_location=self.device), strict=False)
89
+ self.unet_model.load_state_dict(torch.load(realcustom_checkpoint, map_location=self.device), strict=False)
90
+ print("loading unet model finished.")
91
+
92
+ def _reload_unet_checkpoint(self, unet_checkpoint, realcustom_checkpoint):
93
+ self.unet_model.load_state_dict(torch.load(unet_checkpoint, map_location=self.device), strict=False)
94
+ self.unet_model.load_state_dict(torch.load(realcustom_checkpoint, map_location=self.device), strict=False)
95
+ print("reloading unet model finished.")
96
+
97
+ def _load_vae_checkpoint(self, vae_config, vae_checkpoint):
98
+ # Initialize vae model
99
+ with open(vae_config) as vae_config_file:
100
+ vae_config = json.load(vae_config_file)
101
+ self.latent_channels = vae_config["latent_channels"]
102
+ self.vae_downsample_factor = 2 ** (len(vae_config["block_out_channels"]) - 1) # 2 ** 3 = 8
103
+
104
+ vae_model = AutoencoderKL(**vae_config)
105
+ vae_model.eval().to(self.device).to(self.torch_dtype)
106
+ vae_model.load_state_dict(torch.load(vae_checkpoint, map_location=self.device))
107
+ self.vae_decoder = torch.compile(lambda x: vae_model.decode(x / vae_model.scaling_factor).sample.clip(-1, 1), disable=True)
108
+ self.vae_encoder = torch.compile(lambda x: vae_model.encode(x).latent_dist.mode().mul_(vae_model.scaling_factor), disable=True)
109
+
110
+ print("loading vae finished.")
111
+
112
+ def _load_encoder_checkpoint(self, ):
113
+ # Initialize text encoder
114
+ text_encoder_variant = ["ckpts/sdxl/clip-sdxl-1", "ckpts/sdxl/clip-sdxl-2"]
115
+ text_encoder_mode = ["penultimate_nonorm"]
116
+ self.text_model = TextModel(text_encoder_variant, text_encoder_mode)
117
+ self.text_model.eval().to(self.device).to(self.torch_dtype)
118
+ print("loading text model finished.")
119
+
120
+ # Initialize image encoder
121
+ self.vision_model = instantiate_from_config(self.vision_model_config)
122
+ self.vision_model.eval().to(self.device).to(self.torch_dtype)
123
+ print("loading image model finished.")
124
+
125
+ def _init_scheduler(self, ):
126
+ # Initialize ddim scheduler
127
+ ddim_train_steps = 1000
128
+ schedule_type = "squared_linear"
129
+ scheduler_type = "dpm"
130
+ schedule_shift_snr = 1
131
+ self.sample_steps = 25
132
+ ddim_betas = get_betas(name=schedule_type, num_steps=ddim_train_steps, shift_snr=schedule_shift_snr, terminal_pure_noise=False)
133
+ scheduler_class = DPMSolverSingleStepScheduler if scheduler_type == 'dpm' else DDIMScheduler
134
+
135
+ self.scheduler = scheduler_class(betas=ddim_betas, num_train_timesteps=ddim_train_steps, num_inference_timesteps=self.sample_steps, device=self.device)
136
+ self.infer_timesteps = self.scheduler.timesteps
137
+
138
+ def _load_negative_prompt(self, ):
139
+ with open("prompts/validation_negative.txt") as f:
140
+ self.negative_prompt = f.read().strip()
141
+ self.text_negative_output = self.text_model(self.negative_prompt)
142
+
143
+ def generation(
144
+ self,
145
+ text,
146
+ image_pil,
147
+ target_phrase,
148
+
149
+ height=1024,
150
+ width=1024,
151
+ guidance_scale=3.5,
152
+ seed=1234,
153
+ samples_per_prompt=4,
154
+
155
+ mask_scope=0.25,
156
+
157
+ new_unet_checkpoint="", # in case you want to change
158
+ new_realcustom_checkpoint="", # in case you want to change
159
+ mask_strategy=["min_max_per_channel"],
160
+ mask_reused_step=12,
161
+ return_each_image=False,
162
+ ):
163
+
164
+ if new_unet_checkpoint != "" and new_unet_checkpoint != self.unet_checkpoint:
165
+ self.unet_checkpoint = new_unet_checkpoint
166
+ self.unet_model.load_state_dict(torch.load(new_unet_checkpoint, map_location=self.device), strict=False)
167
+ print("Reloading Unet {} finised.".format(new_unet_checkpoint))
168
+ if new_realcustom_checkpoint != "" and new_realcustom_checkpoint != self.realcustom_checkpoint:
169
+ self.realcustom_checkpoint = new_realcustom_checkpoint
170
+ self.unet_model.load_state_dict(torch.load(new_realcustom_checkpoint, map_location=self.device), strict=False)
171
+ print("Reloading RealCustom {} finised.".format(new_realcustom_checkpoint))
172
+
173
+ samples_per_prompt = int(samples_per_prompt)
174
+ image_metadata_validate = self._get_metadata(height, width, samples_per_prompt)
175
+ if seed == -1:
176
+ seed = torch.randint(0, 1000000, (1,)).item()
177
+ seed = int(seed)
178
+
179
+ with torch.no_grad(), torch.autocast(self.device, self.torch_dtype):
180
+ target_token = self._find_phrase_positions_in_text(text, target_phrase)
181
+
182
+ # Compute text embeddings
183
+ text_positive_output = self.text_model(text)
184
+ text_positive_embeddings = text_positive_output.embeddings.repeat_interleave(samples_per_prompt, dim=0)
185
+ text_positive_pooled = text_positive_output.pooled[-1].repeat_interleave(samples_per_prompt, dim=0)
186
+ if guidance_scale != 1:
187
+ text_negative_embeddings = self.text_negative_output.embeddings.repeat_interleave(samples_per_prompt, dim=0)
188
+ text_negative_pooled = self.text_negative_output.pooled[-1].repeat_interleave(samples_per_prompt, dim=0)
189
+
190
+ # Compute image embeddings
191
+ # positive_image = Image.open(image_path).convert("RGB")
192
+ positive_image = image_pil
193
+ positive_image = torchvision.transforms.ToTensor()(positive_image)
194
+
195
+ positive_image = positive_image.unsqueeze(0).repeat_interleave(samples_per_prompt, dim=0)
196
+ positive_image = torch.nn.functional.interpolate(
197
+ positive_image,
198
+ size=(768, 768),
199
+ mode="bilinear",
200
+ align_corners=False
201
+ )
202
+ negative_image = torch.zeros_like(positive_image)
203
+ positive_image = positive_image.to(self.device).to(self.torch_dtype)
204
+ negative_image = negative_image.to(self.device).to(self.torch_dtype)
205
+
206
+ positive_image_dict = {"image_ref": positive_image}
207
+ positive_image_output = self.vision_model(positive_image_dict, device=self.device)
208
+
209
+ negative_image_dict = {"image_ref": negative_image}
210
+ negative_image_output = self.vision_model(negative_image_dict, device=self.device)
211
+
212
+ # Initialize latent with input latent
213
+ latent = torch.randn(
214
+ size=[
215
+ samples_per_prompt,
216
+ self.latent_channels,
217
+ height // self.vae_downsample_factor,
218
+ width // self.vae_downsample_factor
219
+ ],
220
+ device=self.device,
221
+ generator=torch.Generator(self.device).manual_seed(seed)).to(self.torch_dtype)
222
+ target_h = (height // self.vae_downsample_factor) // 2
223
+ target_w = (width // self.vae_downsample_factor) // 2
224
+
225
+ text2image_crossmap_2d_all_timesteps_list = []
226
+ current_step = 0
227
+ pbar_text = text[:40]
228
+ for timestep in tqdm(iterable=self.infer_timesteps, desc=f"[{pbar_text}]", dynamic_ncols=True):
229
+ if current_step < mask_reused_step:
230
+ pred_cond, pred_cond_dict = self.unet_model(
231
+ sample=latent,
232
+ timestep=timestep,
233
+ encoder_hidden_states=text_positive_embeddings,
234
+ encoder_attention_mask=None,
235
+ added_cond_kwargs=dict(
236
+ text_embeds=text_positive_pooled,
237
+ time_ids=image_metadata_validate
238
+ ),
239
+ vision_input_dict=None,
240
+ vision_guided_mask=None,
241
+ return_as_origin=False,
242
+ return_text2image_mask=True,
243
+ )
244
+ crossmap_2d_avg = mask_generation(
245
+ crossmap_2d_list=pred_cond_dict["text2image_crossmap_2d"], selfmap_2d_list=pred_cond_dict.get("self_attention_map", []),
246
+ target_token=target_token, mask_scope=mask_scope,
247
+ mask_target_h=target_h, mask_target_w=target_w, mask_mode=mask_strategy,
248
+ )
249
+ else:
250
+ # using previous step's mask
251
+ crossmap_2d_avg = text2image_crossmap_2d_all_timesteps_list[-1].squeeze(1)
252
+ if crossmap_2d_avg.dim() == 5: # Means that each layer uses a separate mask weight.
253
+ text2image_crossmap_2d_all_timesteps_list.append(crossmap_2d_avg.mean(dim=2).unsqueeze(1))
254
+ else:
255
+ text2image_crossmap_2d_all_timesteps_list.append(crossmap_2d_avg.unsqueeze(1))
256
+
257
+ pred_cond, pred_cond_dict = self.unet_model(
258
+ sample=latent,
259
+ timestep=timestep,
260
+ encoder_hidden_states=text_positive_embeddings,
261
+ encoder_attention_mask=None,
262
+ added_cond_kwargs=dict(
263
+ text_embeds=text_positive_pooled,
264
+ time_ids=image_metadata_validate
265
+ ),
266
+ vision_input_dict=positive_image_output,
267
+ vision_guided_mask=crossmap_2d_avg,
268
+ return_as_origin=False,
269
+ return_text2image_mask=True,
270
+ multiple_reference_image=False
271
+ )
272
+
273
+ pred_negative, pred_negative_dict = self.unet_model(
274
+ sample=latent,
275
+ timestep=timestep,
276
+ encoder_hidden_states=text_negative_embeddings,
277
+ encoder_attention_mask=None,
278
+ added_cond_kwargs=dict(
279
+ text_embeds=text_negative_pooled,
280
+ time_ids=image_metadata_validate
281
+ ),
282
+ vision_input_dict=negative_image_output,
283
+ vision_guided_mask=crossmap_2d_avg,
284
+ return_as_origin=False,
285
+ return_text2image_mask=True,
286
+ multiple_reference_image=False
287
+ )
288
+
289
+ pred = classifier_free_guidance_image_prompt_cascade(
290
+ pred_t_cond=None, pred_ti_cond=pred_cond, pred_uncond=pred_negative,
291
+ guidance_weight_t=guidance_scale, guidance_weight_i=guidance_scale,
292
+ guidance_stdev_rescale_factor=0, cfg_rescale_mode="naive_global_direct"
293
+ )
294
+ step = self.scheduler.step(
295
+ model_output=pred,
296
+ model_output_type=self.unet_prediction,
297
+ timestep=timestep,
298
+ sample=latent)
299
+
300
+ latent = step.prev_sample
301
+
302
+ current_step += 1
303
+ sample = self.vae_decoder(step.pred_original_sample)
304
+
305
+ # save each image
306
+ images_pil_list = []
307
+ for sample_i in range(sample.size(0)):
308
+ sample_i_image = torch.clamp(sample[sample_i] * 0.5 + 0.5, min=0, max=1).float()
309
+
310
+ images_pil_list.append(to_pil_image(sample_i_image))
311
+ # to_pil_image(sample_i_image).save("./test_{}.jpg".format(sample_i))
312
+
313
+ # save grid images
314
+ sample = make_grid(sample, normalize=True, value_range=(-1, 1), nrow=int(samples_per_prompt ** 0.5)).float()
315
+ # to_pil_image(sample).save("./output_grid_image.jpg")
316
+
317
+ # save all masks
318
+ text2image_crossmap_2d_all_timesteps = torch.cat(text2image_crossmap_2d_all_timesteps_list, dim=1)
319
+ text2image_crossmap_2d_all_timesteps = rearrange(text2image_crossmap_2d_all_timesteps, "b t c h w -> (b t) c h w")
320
+ c = text2image_crossmap_2d_all_timesteps.size(1)
321
+ text2image_crossmap_2d_all_timesteps = rearrange(text2image_crossmap_2d_all_timesteps, "B (c 1) h w -> (B c) 1 h w")
322
+ sample_mask = make_grid(text2image_crossmap_2d_all_timesteps, normalize=False, value_range=(-1, 1), nrow=int(self.sample_steps * c))
323
+ # to_pil_image(sample_mask).save("./output_grid_mask.jpg")
324
+
325
+ if return_each_image:
326
+ return images_pil_list, to_pil_image(sample), to_pil_image(sample_mask)
327
+ else:
328
+ return to_pil_image(sample), to_pil_image(sample_mask)
329
+
330
+ def _get_metadata(self, height, width, samples_per_prompt):
331
+ image_metadata_validate = torch.tensor(
332
+ data=[
333
+ width, # original_height
334
+ height, # original_width
335
+ 0, # coordinate top
336
+ 0, # coordinate left
337
+ width, # target_height
338
+ height, # target_width
339
+ ],
340
+ device=self.device,
341
+ dtype=self.torch_dtype
342
+ ).view(1, -1).repeat(samples_per_prompt, 1)
343
+
344
+ return image_metadata_validate
345
+
346
+ def _find_phrase_positions_in_text(self, text, target_phrase):
347
+ # Compute target phrases
348
+ target_token = torch.zeros(1, 77).to(self.device)
349
+ positions = find_phrase_positions_in_text(text, target_phrase)
350
+ for position in positions:
351
+ prompt_before = text[:position] # NOTE We do not need -1 here because the SDXL text encoder does not encode the trailing space.
352
+ prompt_include = text[:position+len(target_phrase)]
353
+ print("prompt before: ", prompt_before, ", prompt_include: ", prompt_include)
354
+ prompt_before_length = self.text_model.get_vaild_token_length(prompt_before) + 1
355
+ prompt_include_length = self.text_model.get_vaild_token_length(prompt_include) + 1
356
+ print("prompt_before_length: ", prompt_before_length, ", prompt_include_length: ", prompt_include_length)
357
+ target_token[:, prompt_before_length:prompt_include_length] = 1
358
+
359
+ return target_token
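
Beyond the grid/mask pair consumed by the Gradio demo, `generation` can also hand back the individual samples. A short usage sketch, reusing the `pipeline` and `ref_image` objects from the `app.py` example above:

```python
# return_each_image=True additionally yields the per-sample PIL images.
images, grid, mask_grid = pipeline.generation(
    "the figurine is flying in the sky",
    ref_image,
    "figurine",
    samples_per_prompt=4,
    return_each_image=True,
)
for i, image in enumerate(images):
    image.save(f"sample_{i}.jpg")
```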
models/__pycache__/attention_custom.cpython-310.pyc ADDED
Binary file (12.5 kB).
 
models/__pycache__/attention_processor_custom_cross.cpython-310.pyc ADDED
Binary file (38.7 kB).
 
models/__pycache__/base_vision.cpython-310.pyc ADDED
Binary file (8.28 kB).
 
models/__pycache__/dino.cpython-310.pyc ADDED
Binary file (7.08 kB).
 
models/__pycache__/image_encoder_siglipdino_shallowdeep.cpython-310.pyc ADDED
Binary file (4.23 kB). View file
 
models/__pycache__/projectors.cpython-310.pyc ADDED
Binary file (4.33 kB). View file
 
models/__pycache__/sigclip.cpython-310.pyc ADDED
Binary file (5.81 kB). View file
 
models/__pycache__/text.cpython-310.pyc ADDED
Binary file (2.88 kB). View file
 
models/__pycache__/transformer_2d_custom.cpython-310.pyc ADDED
Binary file (11.2 kB). View file
 
models/__pycache__/unet_2d_blocks_custom.cpython-310.pyc ADDED
Binary file (52.5 kB). View file
 
models/__pycache__/unet_2d_condition_custom.cpython-310.pyc ADDED
Binary file (31.1 kB). View file
 
models/__pycache__/vae.cpython-310.pyc ADDED
Binary file (1.08 kB). View file
 
models/attention_custom.py ADDED
@@ -0,0 +1,425 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, Dict, Optional
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import nn
21
+
22
+ # from diffusers.utils import maybe_allow_in_graph
23
+ from diffusers.models.activations import get_activation
24
+ from diffusers.models.attention_processor import Attention
25
+ from models.attention_processor_custom_cross import Attention as CrossAttention
26
+ from diffusers.models.embeddings import CombinedTimestepLabelEmbeddings
27
+
28
+ from utils import update_dict
29
+
30
+ # @maybe_allow_in_graph
31
+ class BasicTransformerBlock(nn.Module):
32
+ r"""
33
+ A basic Transformer block.
34
+
35
+ Parameters:
36
+ dim (`int`): The number of channels in the input and output.
37
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
38
+ attention_head_dim (`int`): The number of channels in each head.
39
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
40
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
41
+ only_cross_attention (`bool`, *optional*):
42
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
43
+ double_self_attention (`bool`, *optional*):
44
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
45
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
46
+ num_embeds_ada_norm (:
47
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
48
+ attention_bias (:
49
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ dim: int,
55
+ num_attention_heads: int,
56
+ attention_head_dim: int,
57
+ dropout=0.0,
58
+ cross_attention_dim: Optional[int] = None,
59
+ activation_fn: str = "geglu",
60
+ num_embeds_ada_norm: Optional[int] = None,
61
+ attention_bias: bool = False,
62
+ only_cross_attention: bool = False,
63
+ double_self_attention: bool = False,
64
+ upcast_attention: bool = False,
65
+ norm_elementwise_affine: bool = True,
66
+ norm_type: str = "layer_norm",
67
+ final_dropout: bool = False,
68
+ image_prompt_settings: dict = {},
69
+ ):
70
+ super().__init__()
71
+ self.only_cross_attention = only_cross_attention
72
+
73
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
74
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
75
+
76
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
77
+ raise ValueError(
78
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
79
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
80
+ )
81
+
82
+ # Define 3 blocks. Each block has its own normalization layer.
83
+ # 1. Self-Attn
84
+ if self.use_ada_layer_norm:
85
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
86
+ elif self.use_ada_layer_norm_zero:
87
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
88
+ else:
89
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
90
+ self.attn1 = Attention(
91
+ query_dim=dim,
92
+ heads=num_attention_heads,
93
+ dim_head=attention_head_dim,
94
+ dropout=dropout,
95
+ bias=attention_bias,
96
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
97
+ upcast_attention=upcast_attention,
98
+ )
99
+
100
+ # 2. Cross-Attn
101
+ if cross_attention_dim is not None or double_self_attention:
102
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
103
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
104
+ # the second cross attention block.
105
+ self.norm2 = (
106
+ AdaLayerNorm(dim, num_embeds_ada_norm)
107
+ if self.use_ada_layer_norm
108
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
109
+ )
110
+ # self.attn2 = CrossAttention(
111
+ # self.attn2 = SelfAttention(
112
+ self.attn2 = CrossAttention(
113
+ query_dim=dim,
114
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
115
+ heads=num_attention_heads,
116
+ dim_head=attention_head_dim,
117
+ dropout=dropout,
118
+ bias=attention_bias,
119
+ upcast_attention=upcast_attention,
120
+ image_prompt_settings=image_prompt_settings,
121
+ )
122
+ else:
123
+ self.norm2 = None
124
+ self.attn2 = None
125
+
126
+ # 3. Feed-forward
127
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
128
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
129
+
130
+ # let chunk size default to None
131
+ self._chunk_size = None
132
+ self._chunk_dim = 0
133
+
134
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
135
+ # Sets chunk feed-forward
136
+ self._chunk_size = chunk_size
137
+ self._chunk_dim = dim
138
+
139
+ def forward(
140
+ self,
141
+ hidden_states: torch.FloatTensor,
142
+ attention_mask: Optional[torch.FloatTensor] = None,
143
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
144
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
145
+ timestep: Optional[torch.LongTensor] = None,
146
+ cross_attention_kwargs: Dict[str, Any] = None,
147
+ class_labels: Optional[torch.LongTensor] = None,
148
+ encoder_hidden_states_vision = None,
149
+ encoder_hidden_states_control = None,
150
+ vision_guided_mask = None,
151
+ extra_dict_inputs = {},
152
+ height = None,
153
+ width = None,
154
+ return_self_attn_map = False,
155
+ ):
156
+ extra_dict_outputs = {}
157
+ # Notice that normalization is always applied before the real computation in the following blocks.
158
+ # 1. Self-Attention
159
+ if self.use_ada_layer_norm:
160
+ norm_hidden_states = self.norm1(hidden_states, timestep)
161
+ elif self.use_ada_layer_norm_zero:
162
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
163
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
164
+ )
165
+ else:
166
+ norm_hidden_states = self.norm1(hidden_states)
167
+
168
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
169
+
170
+ # self attention in XL
171
+ attn_output = self.attn1(
172
+ norm_hidden_states,
173
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
174
+ attention_mask=attention_mask,
175
+ # vision_guided_mask=vision_guided_mask,
176
+ # height=height,
177
+ # width=width,
178
+ # return_self_attn_map=return_self_attn_map,
179
+ # **cross_attention_kwargs,
180
+ )
181
+ # extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_attn)
182
+ if self.use_ada_layer_norm_zero:
183
+ attn_output = gate_msa.unsqueeze(1) * attn_output
184
+ hidden_states = attn_output + hidden_states
185
+
186
+ # 2. Cross-Attention
187
+ if self.attn2 is not None:
188
+ norm_hidden_states = (
189
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
190
+ )
191
+
192
+ attn_output, extra_dict_output_attn = self.attn2(
193
+ norm_hidden_states,
194
+ encoder_hidden_states=encoder_hidden_states,
195
+ attention_mask=encoder_attention_mask,
196
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
197
+ encoder_hidden_states_control=encoder_hidden_states_control,
198
+ vision_guided_mask=vision_guided_mask,
199
+ extra_dict_inputs=extra_dict_inputs,
200
+ height=height,
201
+ width=width,
202
+ **cross_attention_kwargs,
203
+ )
204
+ extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_attn)
205
+ # attn_output = self.attn2(
206
+ # norm_hidden_states,
207
+ # encoder_hidden_states=encoder_hidden_states,
208
+ # attention_mask=encoder_attention_mask,
209
+ # **cross_attention_kwargs,
210
+ # )
211
+ hidden_states = attn_output + hidden_states
212
+
213
+
214
+ # 3. Feed-forward
215
+ norm_hidden_states = self.norm3(hidden_states)
216
+
217
+ if self.use_ada_layer_norm_zero:
218
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
219
+
220
+ if self._chunk_size is not None:
221
+ # "feed_forward_chunk_size" can be used to save memory
222
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
223
+ raise ValueError(
224
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
225
+ )
226
+
227
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
228
+ ff_output = torch.cat(
229
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
230
+ dim=self._chunk_dim,
231
+ )
232
+ else:
233
+ ff_output = self.ff(norm_hidden_states)
234
+
235
+ if self.use_ada_layer_norm_zero:
236
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
237
+
238
+ hidden_states = ff_output + hidden_states
239
+
240
+ return hidden_states, extra_dict_outputs
241
+
242
+
243
+ class FeedForward(nn.Module):
244
+ r"""
245
+ A feed-forward layer.
246
+
247
+ Parameters:
248
+ dim (`int`): The number of channels in the input.
249
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
250
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
251
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
252
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
253
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
254
+ """
255
+
256
+ def __init__(
257
+ self,
258
+ dim: int,
259
+ dim_out: Optional[int] = None,
260
+ mult: int = 4,
261
+ dropout: float = 0.0,
262
+ activation_fn: str = "geglu",
263
+ final_dropout: bool = False,
264
+ ):
265
+ super().__init__()
266
+ inner_dim = int(dim * mult)
267
+ dim_out = dim_out if dim_out is not None else dim
268
+
269
+ if activation_fn == "gelu":
270
+ act_fn = GELU(dim, inner_dim)
271
+ if activation_fn == "gelu-approximate":
272
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
273
+ elif activation_fn == "geglu":
274
+ act_fn = GEGLU(dim, inner_dim)
275
+ elif activation_fn == "geglu-approximate":
276
+ act_fn = ApproximateGELU(dim, inner_dim)
277
+
278
+ self.net = nn.ModuleList([])
279
+ # project in
280
+ self.net.append(act_fn)
281
+ # project dropout
282
+ self.net.append(nn.Dropout(dropout))
283
+ # project out
284
+ self.net.append(nn.Linear(inner_dim, dim_out))
285
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
286
+ if final_dropout:
287
+ self.net.append(nn.Dropout(dropout))
288
+
289
+ def forward(self, hidden_states):
290
+ for module in self.net:
291
+ hidden_states = module(hidden_states)
292
+ return hidden_states
293
+
294
+
295
+ class GELU(nn.Module):
296
+ r"""
297
+ GELU activation function with tanh approximation support with `approximate="tanh"`.
298
+ """
299
+
300
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
301
+ super().__init__()
302
+ self.proj = nn.Linear(dim_in, dim_out)
303
+ self.approximate = approximate
304
+
305
+ def gelu(self, gate):
306
+ if gate.device.type != "mps":
307
+ return F.gelu(gate, approximate=self.approximate)
308
+ # mps: gelu is not implemented for float16
309
+ return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
310
+
311
+ def forward(self, hidden_states):
312
+ hidden_states = self.proj(hidden_states)
313
+ hidden_states = self.gelu(hidden_states)
314
+ return hidden_states
315
+
316
+
317
+ class GEGLU(nn.Module):
318
+ r"""
319
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
320
+
321
+ Parameters:
322
+ dim_in (`int`): The number of channels in the input.
323
+ dim_out (`int`): The number of channels in the output.
324
+ """
325
+
326
+ def __init__(self, dim_in: int, dim_out: int):
327
+ super().__init__()
328
+ self.proj = nn.Linear(dim_in, dim_out * 2)
329
+
330
+ def gelu(self, gate):
331
+ if gate.device.type != "mps":
332
+ return F.gelu(gate)
333
+ # mps: gelu is not implemented for float16
334
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
335
+
336
+ def forward(self, hidden_states):
337
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
338
+ return hidden_states * self.gelu(gate)
339
+
340
+
341
+ class ApproximateGELU(nn.Module):
342
+ """
343
+ The approximate form of Gaussian Error Linear Unit (GELU)
344
+
345
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
346
+ """
347
+
348
+ def __init__(self, dim_in: int, dim_out: int):
349
+ super().__init__()
350
+ self.proj = nn.Linear(dim_in, dim_out)
351
+
352
+ def forward(self, x):
353
+ x = self.proj(x)
354
+ return x * torch.sigmoid(1.702 * x)
355
+
356
+
357
+ class AdaLayerNorm(nn.Module):
358
+ """
359
+ Norm layer modified to incorporate timestep embeddings.
360
+ """
361
+
362
+ def __init__(self, embedding_dim, num_embeddings):
363
+ super().__init__()
364
+ self.emb = nn.Embedding(num_embeddings, embedding_dim)
365
+ self.silu = nn.SiLU()
366
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
367
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
368
+
369
+ def forward(self, x, timestep):
370
+ emb = self.linear(self.silu(self.emb(timestep)))
371
+ scale, shift = torch.chunk(emb, 2)
372
+ x = self.norm(x) * (1 + scale) + shift
373
+ return x
374
+
375
+
376
+ class AdaLayerNormZero(nn.Module):
377
+ """
378
+ Norm layer adaptive layer norm zero (adaLN-Zero).
379
+ """
380
+
381
+ def __init__(self, embedding_dim, num_embeddings):
382
+ super().__init__()
383
+
384
+ self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
385
+
386
+ self.silu = nn.SiLU()
387
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
388
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
389
+
390
+ def forward(self, x, timestep, class_labels, hidden_dtype=None):
391
+ emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
392
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
393
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
394
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
395
+
396
+
397
+ class AdaGroupNorm(nn.Module):
398
+ """
399
+ GroupNorm layer modified to incorporate timestep embeddings.
400
+ """
401
+
402
+ def __init__(
403
+ self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
404
+ ):
405
+ super().__init__()
406
+ self.num_groups = num_groups
407
+ self.eps = eps
408
+
409
+ if act_fn is None:
410
+ self.act = None
411
+ else:
412
+ self.act = get_activation(act_fn)
413
+
414
+ self.linear = nn.Linear(embedding_dim, out_dim * 2)
415
+
416
+ def forward(self, x, emb):
417
+ if self.act:
418
+ emb = self.act(emb)
419
+ emb = self.linear(emb)
420
+ emb = emb[:, :, None, None]
421
+ scale, shift = emb.chunk(2, dim=1)
422
+
423
+ x = F.group_norm(x, self.num_groups, eps=self.eps)
424
+ x = x * (1 + scale) + shift
425
+ return x
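
Note (not part of the commit): the image-prompt settings wired through BasicTransformerBlock above are consumed by the cross-attention processor defined in the next file, which gates the vision branch of the attention output with a spatial guidance mask. The snippet below is an illustrative sketch of that gating step only, under assumed shapes; `gate_vision_branch` is a hypothetical helper, not code from this repository.

import torch
import torch.nn.functional as F
from einops import rearrange

def gate_vision_branch(hidden_states_vision, vision_guided_mask, height, width):
    # hidden_states_vision: [B, H*W, C] output of the image-prompt attention branch
    # vision_guided_mask:   [B, 1, H0, W0] subject mask at an arbitrary resolution
    mask = F.interpolate(vision_guided_mask.float(), size=(height, width), mode="bilinear")
    mask_1d = rearrange(mask, "b c h w -> b (h w) c")  # broadcasts over the channel axis
    return hidden_states_vision * mask_1d

b, h, w, c = 2, 16, 16, 320
out = gate_vision_branch(torch.randn(b, h * w, c), torch.rand(b, 1, 64, 64), h, w)
print(out.shape)  # torch.Size([2, 256, 320])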
models/attention_processor_custom_cross.py ADDED
@@ -0,0 +1,1778 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Callable, Optional, Union
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import nn
21
+
22
+ # from diffusers.utils import deprecate, logging, maybe_allow_in_graph
23
+ from diffusers.utils import deprecate, logging
24
+ from diffusers.utils.import_utils import is_xformers_available
25
+ from einops import rearrange
26
+ import random
27
+
28
+ from utils import zero_module
29
+ # from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
+ if is_xformers_available():
34
+ import xformers
35
+ import xformers.ops
36
+ else:
37
+ xformers = None
38
+
39
+ # Cross Attention
40
+ # @maybe_allow_in_graph
41
+ class Attention(nn.Module):
42
+ r"""
43
+ A cross attention layer.
44
+
45
+ Parameters:
46
+ query_dim (`int`): The number of channels in the query.
47
+ cross_attention_dim (`int`, *optional*):
48
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
49
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
50
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
51
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
52
+ bias (`bool`, *optional*, defaults to False):
53
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ query_dim: int,
59
+ cross_attention_dim: Optional[int] = None,
60
+ heads: int = 8,
61
+ dim_head: int = 64,
62
+ dropout: float = 0.0,
63
+ bias=False,
64
+ upcast_attention: bool = False,
65
+ upcast_softmax: bool = False,
66
+ cross_attention_norm: Optional[str] = None,
67
+ cross_attention_norm_num_groups: int = 32,
68
+ added_kv_proj_dim: Optional[int] = None,
69
+ norm_num_groups: Optional[int] = None,
70
+ spatial_norm_dim: Optional[int] = None,
71
+ out_bias: bool = True,
72
+ scale_qk: bool = True,
73
+ only_cross_attention: bool = False,
74
+ eps: float = 1e-5,
75
+ rescale_output_factor: float = 1.0,
76
+ residual_connection: bool = False,
77
+ _from_deprecated_attn_block=False,
78
+ processor: Optional["AttnProcessor"] = None,
79
+ image_prompt_settings: dict = {}
80
+ ):
81
+ super().__init__()
82
+
83
+ inner_dim = dim_head * heads
84
+ self.inner_dim = inner_dim
85
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
86
+ self.upcast_attention = upcast_attention
87
+ self.upcast_softmax = upcast_softmax
88
+ self.rescale_output_factor = rescale_output_factor
89
+ self.residual_connection = residual_connection
90
+ self.dropout = dropout
91
+
92
+ # we make use of this private variable to know whether this class is loaded
93
+ # with a deprecated state dict so that we can convert it on the fly
94
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
95
+
96
+ self.scale_qk = scale_qk
97
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
98
+
99
+ self.heads = heads
100
+ # for slice_size > 0 the attention score computation
101
+ # is split across the batch axis to save memory
102
+ # You can set slice_size with `set_attention_slice`
103
+ self.sliceable_head_dim = heads
104
+
105
+ self.added_kv_proj_dim = added_kv_proj_dim
106
+ self.only_cross_attention = only_cross_attention
107
+
108
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
109
+ raise ValueError(
110
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
111
+ )
112
+
113
+ if norm_num_groups is not None:
114
+ self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
115
+ else:
116
+ self.group_norm = None
117
+
118
+ if spatial_norm_dim is not None:
119
+ self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
120
+ else:
121
+ self.spatial_norm = None
122
+
123
+ if cross_attention_norm is None:
124
+ self.norm_cross = None
125
+ elif cross_attention_norm == "layer_norm":
126
+ self.norm_cross = nn.LayerNorm(cross_attention_dim)
127
+ elif cross_attention_norm == "group_norm":
128
+ if self.added_kv_proj_dim is not None:
129
+ # The given `encoder_hidden_states` are initially of shape
130
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
131
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
132
+ # before the projection, so we need to use `added_kv_proj_dim` as
133
+ # the number of channels for the group norm.
134
+ norm_cross_num_channels = added_kv_proj_dim
135
+ else:
136
+ norm_cross_num_channels = cross_attention_dim
137
+
138
+ self.norm_cross = nn.GroupNorm(
139
+ num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
140
+ )
141
+ else:
142
+ raise ValueError(
143
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
144
+ )
145
+
146
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
147
+
148
+ if not self.only_cross_attention:
149
+ # only relevant for the `AddedKVProcessor` classes
150
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
151
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
152
+ else:
153
+ self.to_k = None
154
+ self.to_v = None
155
+
156
+ if self.added_kv_proj_dim is not None:
157
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
158
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)
159
+
160
+ # NOTE for custom dual branch settings
161
+ self.cross_attention_id = image_prompt_settings.get("cross_attention_id", 0)
162
+ self.use_cross_attention_id = image_prompt_settings.get("use_cross_attention_id", False)
163
+ self.image_prompt_mode = image_prompt_settings.get("image_prompt_mode", "none")
164
+
165
+ if self.image_prompt_mode == "naive": # only used in cross-attention, NOT self-attention
166
+ self.to_k_vision = nn.Linear(cross_attention_dim, inner_dim, bias=False)
167
+ self.to_v_vision = nn.Linear(cross_attention_dim, inner_dim, bias=False)
168
+ else:
169
+ if self.image_prompt_mode != "none":
170
+ print("Warning .... unknown self.image_prompt_mode")
171
+
172
+ self.to_out = nn.ModuleList([])
173
+ self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias))
174
+ self.to_out.append(nn.Dropout(dropout))
175
+
176
+ self.to_out = nn.ModuleList([])
177
+ self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias))
178
+ self.to_out.append(nn.Dropout(dropout))
179
+
180
+ # set attention processor
181
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
182
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
183
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
184
+ # <note> processor default to be None
185
+ if processor is None:
186
+ assert hasattr(F, "scaled_dot_product_attention") and self.scale_qk
187
+ processor = AttnProcessor2_0_image_prompt()
188
+
189
+ self.set_processor(processor)
190
+
191
+ def set_use_memory_efficient_attention_xformers(
192
+ self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
193
+ ):
194
+ is_lora = hasattr(self, "processor") and isinstance(
195
+ self.processor,
196
+ (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor),
197
+ )
198
+ is_custom_diffusion = hasattr(self, "processor") and isinstance(
199
+ self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)
200
+ )
201
+ is_added_kv_processor = hasattr(self, "processor") and isinstance(
202
+ self.processor,
203
+ (
204
+ AttnAddedKVProcessor,
205
+ AttnAddedKVProcessor2_0,
206
+ SlicedAttnAddedKVProcessor,
207
+ XFormersAttnAddedKVProcessor,
208
+ LoRAAttnAddedKVProcessor,
209
+ ),
210
+ )
211
+
212
+ if use_memory_efficient_attention_xformers:
213
+ if is_added_kv_processor and (is_lora or is_custom_diffusion):
214
+ raise NotImplementedError(
215
+ f"Memory efficient attention is currently not supported for LoRA or custom diffuson for attention processor type {self.processor}"
216
+ )
217
+ if not is_xformers_available():
218
+ raise ModuleNotFoundError(
219
+ (
220
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
221
+ " xformers"
222
+ ),
223
+ name="xformers",
224
+ )
225
+ elif not torch.cuda.is_available():
226
+ raise ValueError(
227
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
228
+ " only available for GPU "
229
+ )
230
+ else:
231
+ try:
232
+ # Make sure we can run the memory efficient attention
233
+ _ = xformers.ops.memory_efficient_attention(
234
+ torch.randn((1, 2, 40), device="cuda"),
235
+ torch.randn((1, 2, 40), device="cuda"),
236
+ torch.randn((1, 2, 40), device="cuda"),
237
+ )
238
+ except Exception as e:
239
+ raise e
240
+
241
+ if is_lora:
242
+ # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
243
+ # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
244
+ processor = LoRAXFormersAttnProcessor(
245
+ hidden_size=self.processor.hidden_size,
246
+ cross_attention_dim=self.processor.cross_attention_dim,
247
+ rank=self.processor.rank,
248
+ attention_op=attention_op,
249
+ )
250
+ processor.load_state_dict(self.processor.state_dict())
251
+ processor.to(self.processor.to_q_lora.up.weight.device)
252
+ elif is_custom_diffusion:
253
+ processor = CustomDiffusionXFormersAttnProcessor(
254
+ train_kv=self.processor.train_kv,
255
+ train_q_out=self.processor.train_q_out,
256
+ hidden_size=self.processor.hidden_size,
257
+ cross_attention_dim=self.processor.cross_attention_dim,
258
+ attention_op=attention_op,
259
+ )
260
+ processor.load_state_dict(self.processor.state_dict())
261
+ if hasattr(self.processor, "to_k_custom_diffusion"):
262
+ processor.to(self.processor.to_k_custom_diffusion.weight.device)
263
+ elif is_added_kv_processor:
264
+ # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
265
+ # which uses this type of cross attention ONLY because the attention mask of format
266
+ # [0, ..., -10.000, ..., 0, ...,] is not supported
267
+ # throw warning
268
+ logger.info(
269
+ "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
270
+ )
271
+ processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
272
+ else:
273
+ processor = XFormersAttnProcessor(attention_op=attention_op)
274
+ else:
275
+ if is_lora:
276
+ attn_processor_class = (
277
+ LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
278
+ )
279
+ processor = attn_processor_class(
280
+ hidden_size=self.processor.hidden_size,
281
+ cross_attention_dim=self.processor.cross_attention_dim,
282
+ rank=self.processor.rank,
283
+ )
284
+ processor.load_state_dict(self.processor.state_dict())
285
+ processor.to(self.processor.to_q_lora.up.weight.device)
286
+ elif is_custom_diffusion:
287
+ processor = CustomDiffusionAttnProcessor(
288
+ train_kv=self.processor.train_kv,
289
+ train_q_out=self.processor.train_q_out,
290
+ hidden_size=self.processor.hidden_size,
291
+ cross_attention_dim=self.processor.cross_attention_dim,
292
+ )
293
+ processor.load_state_dict(self.processor.state_dict())
294
+ if hasattr(self.processor, "to_k_custom_diffusion"):
295
+ processor.to(self.processor.to_k_custom_diffusion.weight.device)
296
+ else:
297
+ # set attention processor
298
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
299
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
300
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
301
+ processor = (
302
+ AttnProcessor2_0()
303
+ if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
304
+ else AttnProcessor()
305
+ )
306
+
307
+ self.set_processor(processor)
308
+
309
+ def set_attention_slice(self, slice_size):
310
+ if slice_size is not None and slice_size > self.sliceable_head_dim:
311
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
312
+
313
+ if slice_size is not None and self.added_kv_proj_dim is not None:
314
+ processor = SlicedAttnAddedKVProcessor(slice_size)
315
+ elif slice_size is not None:
316
+ processor = SlicedAttnProcessor(slice_size)
317
+ elif self.added_kv_proj_dim is not None:
318
+ processor = AttnAddedKVProcessor()
319
+ else:
320
+ # set attention processor
321
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
322
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
323
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
324
+ processor = (
325
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
326
+ )
327
+
328
+ self.set_processor(processor)
329
+
330
+ def set_processor(self, processor: "AttnProcessor"):
331
+ # if current processor is in `self._modules` and if passed `processor` is not, we need to
332
+ # pop `processor` from `self._modules`
333
+ if (
334
+ hasattr(self, "processor")
335
+ and isinstance(self.processor, torch.nn.Module)
336
+ and not isinstance(processor, torch.nn.Module)
337
+ ):
338
+ logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
339
+ self._modules.pop("processor")
340
+
341
+ self.processor = processor
342
+
343
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None,
344
+ encoder_hidden_states_vision=None, encoder_hidden_states_control=None,
345
+ vision_guided_mask=None, extra_dict_inputs={}, height=None, width=None,
346
+ **cross_attention_kwargs):
347
+ # The `Attention` class can call different attention processors / attention functions
348
+ # here we simply pass along all tensors to the selected processor class
349
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
350
+ return self.processor(
351
+ self,
352
+ hidden_states,
353
+ encoder_hidden_states=encoder_hidden_states,
354
+ attention_mask=attention_mask,
355
+
356
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
357
+ encoder_hidden_states_control=encoder_hidden_states_control,
358
+ vision_guided_mask=vision_guided_mask,
359
+ extra_dict_inputs=extra_dict_inputs,
360
+ image_prompt_mode=self.image_prompt_mode,
361
+
362
+ height=height, width=width,
363
+
364
+ **cross_attention_kwargs,
365
+ )
366
+
367
+ def batch_to_head_dim(self, tensor):
368
+ head_size = self.heads
369
+ batch_size, seq_len, dim = tensor.shape
370
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
371
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
372
+ return tensor
373
+
374
+ def head_to_batch_dim(self, tensor, out_dim=3):
375
+ head_size = self.heads
376
+ batch_size, seq_len, dim = tensor.shape
377
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
378
+ tensor = tensor.permute(0, 2, 1, 3)
379
+
380
+ if out_dim == 3:
381
+ tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
382
+
383
+ return tensor
384
+
385
+ def get_attention_scores(self, query, key, attention_mask=None):
386
+ dtype = query.dtype
387
+ if self.upcast_attention:
388
+ query = query.float()
389
+ key = key.float()
390
+
391
+ if attention_mask is None:
392
+ baddbmm_input = torch.empty(
393
+ query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
394
+ )
395
+ beta = 0
396
+ else:
397
+ baddbmm_input = attention_mask
398
+ beta = 1
399
+
400
+ attention_scores = torch.baddbmm(
401
+ baddbmm_input,
402
+ query,
403
+ key.transpose(-1, -2),
404
+ beta=beta,
405
+ alpha=self.scale,
406
+ )
407
+ del baddbmm_input
408
+
409
+ if self.upcast_softmax:
410
+ attention_scores = attention_scores.float()
411
+
412
+ attention_probs = attention_scores.softmax(dim=-1)
413
+ del attention_scores
414
+
415
+ attention_probs = attention_probs.to(dtype)
416
+
417
+ return attention_probs
418
+
419
+ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
420
+ if batch_size is None:
421
+ deprecate(
422
+ "batch_size=None",
423
+ "0.0.15",
424
+ (
425
+ "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
426
+ " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
427
+ " `prepare_attention_mask` when preparing the attention_mask."
428
+ ),
429
+ )
430
+ batch_size = 1
431
+
432
+ head_size = self.heads
433
+ if attention_mask is None:
434
+ return attention_mask
435
+
436
+ current_length: int = attention_mask.shape[-1]
437
+ if current_length != target_length:
438
+ if attention_mask.device.type == "mps":
439
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
440
+ # Instead, we can manually construct the padding tensor.
441
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
442
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
443
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
444
+ else:
445
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
446
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
447
+ # remaining_length: int = target_length - current_length
448
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
449
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
450
+
451
+ if out_dim == 3:
452
+ if attention_mask.shape[0] < batch_size * head_size:
453
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
454
+ elif out_dim == 4:
455
+ attention_mask = attention_mask.unsqueeze(1)
456
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
457
+
458
+ return attention_mask
459
+
460
+ def norm_encoder_hidden_states(self, encoder_hidden_states):
461
+ assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
462
+
463
+ if isinstance(self.norm_cross, nn.LayerNorm):
464
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
465
+ elif isinstance(self.norm_cross, nn.GroupNorm):
466
+ # Group norm norms along the channels dimension and expects
467
+ # input to be in the shape of (N, C, *). In this case, we want
468
+ # to norm along the hidden dimension, so we need to move
469
+ # (batch_size, sequence_length, hidden_size) ->
470
+ # (batch_size, hidden_size, sequence_length)
471
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
472
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
473
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
474
+ else:
475
+ assert False
476
+
477
+ return encoder_hidden_states
478
+
479
+
480
+ class AttnProcessor2_0_image_prompt:
481
+ r"""
482
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
483
+ """
484
+
485
+ def __init__(self):
486
+ if not hasattr(F, "scaled_dot_product_attention"):
487
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
488
+
489
+ def __call__(
490
+ self,
491
+ attn: Attention,
492
+ hidden_states,
493
+ encoder_hidden_states=None,
494
+ attention_mask=None,
495
+ temb=None,
496
+
497
+ encoder_hidden_states_vision=None,
498
+ encoder_hidden_states_control=None,
499
+ vision_guided_mask=None,
500
+ extra_dict_inputs={},
501
+ image_prompt_mode="none",
502
+
503
+ height=None, width=None,
504
+ ):
505
+ if "multiple_reference_image" in extra_dict_inputs.keys():
506
+ multiple_reference_image = extra_dict_inputs["multiple_reference_image"]
507
+ else:
508
+ multiple_reference_image = False
509
+
510
+ resampled_token = None
511
+ if encoder_hidden_states_vision is not None:
512
+ if attn.use_cross_attention_id:
513
+ encoder_hidden_states_vision = encoder_hidden_states_vision[:, attn.cross_attention_id, :, :]
514
+
515
+ extra_dict_outputs = {}
516
+ residual = hidden_states
517
+
518
+ if attn.spatial_norm is not None:
519
+ hidden_states = attn.spatial_norm(hidden_states, temb)
520
+
521
+ input_ndim = hidden_states.ndim
522
+
523
+ if input_ndim == 4:
524
+ batch_size, channel, height, width = hidden_states.shape
525
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
526
+
527
+ batch_size, sequence_length, _ = (
528
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
529
+ )
530
+ inner_dim = hidden_states.shape[-1]
531
+
532
+ if attention_mask is not None:
533
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
534
+ # scaled_dot_product_attention expects attention_mask shape to be
535
+ # (batch, heads, source_length, target_length)
536
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
537
+ if attn.group_norm is not None:
538
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
539
+
540
+ query = attn.to_q(hidden_states)
541
+
542
+ if encoder_hidden_states is None:
543
+ encoder_hidden_states = hidden_states
544
+ elif attn.norm_cross: # attn.norm_cross: None
545
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
546
+
547
+ # text-image cross attention
548
+ key = attn.to_k(encoder_hidden_states)
549
+ value = attn.to_v(encoder_hidden_states)
550
+
551
+ head_dim = inner_dim // attn.heads
552
+
553
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
554
+
555
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
556
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
557
+
558
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
559
+ # TODO: add support for attn.scale when we move to Torch 2.1
560
+ if attn.training:
561
+ # with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
562
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
563
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
564
+ # hidden_states = flash_attn_func(query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2), dropout_p=0.0, softmax_scale=None, causal=False, window_size=(-1, -1), alibi_slopes=None, deterministic=False)
565
+ else: # use vanilla attention during inference
566
+ with torch.autocast(enabled=True, device_type = 'cuda'):
567
+ q, k, v = query.float(), key.float(), value.float()
568
+ sim = (q @ k.transpose(-2, -1) * attn.scale)
569
+ if attention_mask is not None: # no mask in SDXL?
570
+ attention_mask = 1 + (attention_mask / -10000.0)
571
+ attention_mask = attention_mask.bool()
572
+ max_neg_value = -torch.finfo(sim.dtype).max
573
+ sim.masked_fill_(~attention_mask, max_neg_value)
574
+ sim = sim.softmax(dim=-1)
575
+ hidden_states = torch.einsum('b h i j, b h j d -> b h i d', sim, v)
576
+ extra_dict_outputs["text2image_crossmap_2d"] = rearrange(sim, "b head (h w) n -> b head n h w", h=height, w=width)
577
+
578
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
579
+ hidden_states = hidden_states.to(query.dtype)
580
+
581
+ if encoder_hidden_states_vision is not None and not multiple_reference_image: # single image
582
+ if image_prompt_mode == "naive":
583
+ key_vision = attn.to_k_vision(encoder_hidden_states_vision)
584
+ value_vision = attn.to_v_vision(encoder_hidden_states_vision)
585
+ key_vision = key_vision.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
586
+ value_vision = value_vision.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
587
+ hidden_states_vision = F.scaled_dot_product_attention(query, key_vision, value_vision, dropout_p=0.0, is_causal=False)
588
+ hidden_states_vision = hidden_states_vision.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
589
+ else:
590
+ hidden_states_vision = torch.zeros_like(hidden_states).to(hidden_states.device)
591
+
592
+ if vision_guided_mask is not None:
593
+ if vision_guided_mask.dim() == 4: # all layers share the same mask
594
+ target_h, target_w = vision_guided_mask.size(-2), vision_guided_mask.size(-1)
595
+ vision_guided_mask = F.interpolate(vision_guided_mask.float(), scale_factor=height/target_h, mode='bilinear')
596
+ vision_guided_mask_1d = rearrange(vision_guided_mask, "b c h w -> b (h w) c")
597
+ hidden_states_vision = hidden_states_vision * vision_guided_mask_1d
598
+ else: # each layer uses its own mask, indexed by self.cross_attention_id
599
+ vision_guided_mask = vision_guided_mask[:, :, attn.cross_attention_id, :, :]
600
+ target_h, target_w = vision_guided_mask.size(-2), vision_guided_mask.size(-1)
601
+ vision_guided_mask = F.interpolate(vision_guided_mask, scale_factor=height/target_h, mode='bilinear')
602
+ vision_guided_mask_1d = rearrange(vision_guided_mask, "b c h w -> b (h w) c")
603
+ hidden_states_vision = hidden_states_vision * vision_guided_mask_1d
604
+
605
+ elif encoder_hidden_states_vision is not None and multiple_reference_image: # multiple image
606
+ if image_prompt_mode == "naive":
607
+ image_num = encoder_hidden_states_vision.size(1)
608
+ encoder_hidden_states_vision_list = encoder_hidden_states_vision.chunk(image_num, dim=1)
609
+ if vision_guided_mask is not None:
610
+ vision_guided_mask_list = vision_guided_mask.chunk(image_num, dim=1)
611
+ else:
612
+ vision_guided_mask_list = [None] * image_num
613
+ hidden_states_vision_results = []
614
+ for encoder_hidden_states_vision_i, vision_guided_mask_i in zip(encoder_hidden_states_vision_list, vision_guided_mask_list):
615
+ encoder_hidden_states_vision_i = encoder_hidden_states_vision_i.squeeze(1)
616
+
617
+ key_vision = attn.to_k_vision(encoder_hidden_states_vision_i)
618
+ value_vision = attn.to_v_vision(encoder_hidden_states_vision_i)
619
+ key_vision = key_vision.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
620
+ value_vision = value_vision.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
621
+ hidden_states_vision = F.scaled_dot_product_attention(query, key_vision, value_vision, dropout_p=0.0, is_causal=False)
622
+ hidden_states_vision = hidden_states_vision.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
623
+
624
+ if vision_guided_mask_i is not None:
625
+ target_h, target_w = vision_guided_mask_i.size(-2), vision_guided_mask_i.size(-1)
626
+ vision_guided_mask_i = F.interpolate(vision_guided_mask_i, scale_factor=height/target_h, mode='bilinear')
627
+ vision_guided_mask_1d_i = rearrange(vision_guided_mask_i, "b c h w -> b (h w) c")
628
+ hidden_states_vision = hidden_states_vision * vision_guided_mask_1d_i
629
+
630
+ hidden_states_vision_results.append(hidden_states_vision.unsqueeze(1))
631
+
632
+ hidden_states_vision = torch.cat(hidden_states_vision_results, dim=1).sum(dim=1)
633
+
634
+ else:
635
+ hidden_states_vision = torch.zeros_like(hidden_states).to(hidden_states.device)
636
+ else:
637
+ hidden_states_vision = torch.zeros_like(hidden_states).to(hidden_states.device)
638
+
639
+ hidden_states = hidden_states + hidden_states_vision
640
+
641
+ # linear proj
642
+ hidden_states = attn.to_out[0](hidden_states)
643
+ # dropout
644
+ hidden_states = attn.to_out[1](hidden_states)
645
+
646
+ if input_ndim == 4:
647
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
648
+
649
+ if attn.residual_connection:
650
+ hidden_states = hidden_states + residual
651
+
652
+ hidden_states = hidden_states / attn.rescale_output_factor
653
+
654
+ return hidden_states, extra_dict_outputs
655
+
656
+
657
+ class AttnProcessor:
658
+ r"""
659
+ Default processor for performing attention-related computations.
660
+ """
661
+
662
+ def __call__(
663
+ self,
664
+ attn: Attention,
665
+ hidden_states,
666
+ encoder_hidden_states=None,
667
+ attention_mask=None,
668
+ temb=None,
669
+ ):
670
+ residual = hidden_states
671
+
672
+ if attn.spatial_norm is not None:
673
+ hidden_states = attn.spatial_norm(hidden_states, temb)
674
+
675
+ input_ndim = hidden_states.ndim
676
+
677
+ if input_ndim == 4:
678
+ batch_size, channel, height, width = hidden_states.shape
679
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
680
+
681
+ batch_size, sequence_length, _ = (
682
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
683
+ )
684
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
685
+
686
+ if attn.group_norm is not None:
687
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
688
+
689
+ query = attn.to_q(hidden_states)
690
+
691
+ if encoder_hidden_states is None:
692
+ encoder_hidden_states = hidden_states
693
+ elif attn.norm_cross:
694
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
695
+
696
+ key = attn.to_k(encoder_hidden_states)
697
+ value = attn.to_v(encoder_hidden_states)
698
+
699
+ query = attn.head_to_batch_dim(query)
700
+ key = attn.head_to_batch_dim(key)
701
+ value = attn.head_to_batch_dim(value)
702
+
703
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
704
+ hidden_states = torch.bmm(attention_probs, value)
705
+ hidden_states = attn.batch_to_head_dim(hidden_states)
706
+
707
+ # linear proj
708
+ hidden_states = attn.to_out[0](hidden_states)
709
+ # dropout
710
+ hidden_states = attn.to_out[1](hidden_states)
711
+
712
+ if input_ndim == 4:
713
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
714
+
715
+ if attn.residual_connection:
716
+ hidden_states = hidden_states + residual
717
+
718
+ hidden_states = hidden_states / attn.rescale_output_factor
719
+
720
+ return hidden_states
721
+
722
+
723
+ class LoRALinearLayer(nn.Module):
724
+ def __init__(self, in_features, out_features, rank=4, network_alpha=None):
725
+ super().__init__()
726
+
727
+ if rank > min(in_features, out_features):
728
+ raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
729
+
730
+ self.down = nn.Linear(in_features, rank, bias=False)
731
+ self.up = nn.Linear(rank, out_features, bias=False)
732
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
733
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
734
+ self.network_alpha = network_alpha
735
+ self.rank = rank
736
+
737
+ nn.init.normal_(self.down.weight, std=1 / rank)
738
+ nn.init.zeros_(self.up.weight)
739
+
740
+ def forward(self, hidden_states):
741
+ orig_dtype = hidden_states.dtype
742
+ dtype = self.down.weight.dtype
743
+
744
+ down_hidden_states = self.down(hidden_states.to(dtype))
745
+ up_hidden_states = self.up(down_hidden_states)
746
+
747
+ if self.network_alpha is not None:
748
+ up_hidden_states *= self.network_alpha / self.rank
749
+
750
+ return up_hidden_states.to(orig_dtype)
751
+
752
+
753
+ class LoRAAttnProcessor(nn.Module):
754
+ r"""
755
+ Processor for implementing the LoRA attention mechanism.
756
+
757
+ Args:
758
+ hidden_size (`int`, *optional*):
759
+ The hidden size of the attention layer.
760
+ cross_attention_dim (`int`, *optional*):
761
+ The number of channels in the `encoder_hidden_states`.
762
+ rank (`int`, defaults to 4):
763
+ The dimension of the LoRA update matrices.
764
+ network_alpha (`int`, *optional*):
765
+ Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
766
+ """
767
+
768
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
769
+ super().__init__()
770
+
771
+ self.hidden_size = hidden_size
772
+ self.cross_attention_dim = cross_attention_dim
773
+ self.rank = rank
774
+
775
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
776
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
777
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
778
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
779
+
780
+ def __call__(
781
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
782
+ ):
783
+ residual = hidden_states
784
+
785
+ if attn.spatial_norm is not None:
786
+ hidden_states = attn.spatial_norm(hidden_states, temb)
787
+
788
+ input_ndim = hidden_states.ndim
789
+
790
+ if input_ndim == 4:
791
+ batch_size, channel, height, width = hidden_states.shape
792
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
793
+
794
+ batch_size, sequence_length, _ = (
795
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
796
+ )
797
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
798
+
799
+ if attn.group_norm is not None:
800
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
801
+
802
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
803
+ query = attn.head_to_batch_dim(query)
804
+
805
+ if encoder_hidden_states is None:
806
+ encoder_hidden_states = hidden_states
807
+ elif attn.norm_cross:
808
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
809
+
810
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
811
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
812
+
813
+ key = attn.head_to_batch_dim(key)
814
+ value = attn.head_to_batch_dim(value)
815
+
816
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
817
+ hidden_states = torch.bmm(attention_probs, value)
818
+ hidden_states = attn.batch_to_head_dim(hidden_states)
819
+
820
+ # linear proj
821
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
822
+ # dropout
823
+ hidden_states = attn.to_out[1](hidden_states)
824
+
825
+ if input_ndim == 4:
826
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
827
+
828
+ if attn.residual_connection:
829
+ hidden_states = hidden_states + residual
830
+
831
+ hidden_states = hidden_states / attn.rescale_output_factor
832
+
833
+ return hidden_states
834
+
835
+
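A hedged sketch of what the LoRA sub-layers above compute in isolation; inside `__call__` these low-rank corrections are added to the frozen `attn.to_q/to_k/to_v/to_out[0]` outputs, scaled by `scale`. The 320/2048 sizes below are illustrative, loosely following the UNet config in this repo.

import torch

proc = LoRAAttnProcessor(hidden_size=320, cross_attention_dim=2048, rank=4)
text_ctx = torch.randn(1, 77, 2048)  # encoder_hidden_states-shaped input
delta_k = proc.to_k_lora(text_ctx)   # correction added to attn.to_k(encoder_hidden_states)
print(delta_k.shape)                 # torch.Size([1, 77, 320])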
836
+ class CustomDiffusionAttnProcessor(nn.Module):
837
+ r"""
838
+ Processor for implementing attention for the Custom Diffusion method.
839
+
840
+ Args:
841
+ train_kv (`bool`, defaults to `True`):
842
+ Whether to newly train the key and value matrices corresponding to the text features.
843
+ train_q_out (`bool`, defaults to `True`):
844
+ Whether to newly train query matrices corresponding to the latent image features.
845
+ hidden_size (`int`, *optional*, defaults to `None`):
846
+ The hidden size of the attention layer.
847
+ cross_attention_dim (`int`, *optional*, defaults to `None`):
848
+ The number of channels in the `encoder_hidden_states`.
849
+ out_bias (`bool`, defaults to `True`):
850
+ Whether to include the bias parameter in `train_q_out`.
851
+ dropout (`float`, *optional*, defaults to 0.0):
852
+ The dropout probability to use.
853
+ """
854
+
855
+ def __init__(
856
+ self,
857
+ train_kv=True,
858
+ train_q_out=True,
859
+ hidden_size=None,
860
+ cross_attention_dim=None,
861
+ out_bias=True,
862
+ dropout=0.0,
863
+ ):
864
+ super().__init__()
865
+ self.train_kv = train_kv
866
+ self.train_q_out = train_q_out
867
+
868
+ self.hidden_size = hidden_size
869
+ self.cross_attention_dim = cross_attention_dim
870
+
871
+ # `_custom_diffusion` id for easy serialization and loading.
872
+ if self.train_kv:
873
+ self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
874
+ self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
875
+ if self.train_q_out:
876
+ self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
877
+ self.to_out_custom_diffusion = nn.ModuleList([])
878
+ self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
879
+ self.to_out_custom_diffusion.append(nn.Dropout(dropout))
880
+
881
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
882
+ batch_size, sequence_length, _ = hidden_states.shape
883
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
884
+ if self.train_q_out:
885
+ query = self.to_q_custom_diffusion(hidden_states)
886
+ else:
887
+ query = attn.to_q(hidden_states)
888
+
889
+ if encoder_hidden_states is None:
890
+ crossattn = False
891
+ encoder_hidden_states = hidden_states
892
+ else:
893
+ crossattn = True
894
+ if attn.norm_cross:
895
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
896
+
897
+ if self.train_kv:
898
+ key = self.to_k_custom_diffusion(encoder_hidden_states)
899
+ value = self.to_v_custom_diffusion(encoder_hidden_states)
900
+ else:
901
+ key = attn.to_k(encoder_hidden_states)
902
+ value = attn.to_v(encoder_hidden_states)
903
+
904
+ if crossattn:
905
+ detach = torch.ones_like(key)
906
+ detach[:, :1, :] = detach[:, :1, :] * 0.0
907
+ key = detach * key + (1 - detach) * key.detach()
908
+ value = detach * value + (1 - detach) * value.detach()
909
+
910
+ query = attn.head_to_batch_dim(query)
911
+ key = attn.head_to_batch_dim(key)
912
+ value = attn.head_to_batch_dim(value)
913
+
914
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
915
+ hidden_states = torch.bmm(attention_probs, value)
916
+ hidden_states = attn.batch_to_head_dim(hidden_states)
917
+
918
+ if self.train_q_out:
919
+ # linear proj
920
+ hidden_states = self.to_out_custom_diffusion[0](hidden_states)
921
+ # dropout
922
+ hidden_states = self.to_out_custom_diffusion[1](hidden_states)
923
+ else:
924
+ # linear proj
925
+ hidden_states = attn.to_out[0](hidden_states)
926
+ # dropout
927
+ hidden_states = attn.to_out[1](hidden_states)
928
+
929
+ return hidden_states
930
+
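A small sketch showing which projections become trainable parameters for a given configuration of the class above; with `train_q_out=False` only the new key/value projections carry weights (sizes illustrative).

proc = CustomDiffusionAttnProcessor(
    train_kv=True, train_q_out=False, hidden_size=320, cross_attention_dim=2048
)
print(sorted(name for name, _ in proc.named_parameters()))
# ['to_k_custom_diffusion.weight', 'to_v_custom_diffusion.weight']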
931
+
932
+ class AttnAddedKVProcessor:
933
+ r"""
934
+ Processor for performing attention-related computations with extra learnable key and value matrices for the text
935
+ encoder.
936
+ """
937
+
938
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
939
+ residual = hidden_states
940
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
941
+ batch_size, sequence_length, _ = hidden_states.shape
942
+
943
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
944
+
945
+ if encoder_hidden_states is None:
946
+ encoder_hidden_states = hidden_states
947
+ elif attn.norm_cross:
948
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
949
+
950
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
951
+
952
+ query = attn.to_q(hidden_states)
953
+ query = attn.head_to_batch_dim(query)
954
+
955
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
956
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
957
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
958
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
959
+
960
+ if not attn.only_cross_attention:
961
+ key = attn.to_k(hidden_states)
962
+ value = attn.to_v(hidden_states)
963
+ key = attn.head_to_batch_dim(key)
964
+ value = attn.head_to_batch_dim(value)
965
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
966
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
967
+ else:
968
+ key = encoder_hidden_states_key_proj
969
+ value = encoder_hidden_states_value_proj
970
+
971
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
972
+ hidden_states = torch.bmm(attention_probs, value)
973
+ hidden_states = attn.batch_to_head_dim(hidden_states)
974
+
975
+ # linear proj
976
+ hidden_states = attn.to_out[0](hidden_states)
977
+ # dropout
978
+ hidden_states = attn.to_out[1](hidden_states)
979
+
980
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
981
+ hidden_states = hidden_states + residual
982
+
983
+ return hidden_states
984
+
985
+
986
+ class AttnAddedKVProcessor2_0:
987
+ r"""
988
+ Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra
989
+ learnable key and value matrices for the text encoder.
990
+ """
991
+
992
+ def __init__(self):
993
+ if not hasattr(F, "scaled_dot_product_attention"):
994
+ raise ImportError(
995
+ "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
996
+ )
997
+
998
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
999
+ residual = hidden_states
1000
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
1001
+ batch_size, sequence_length, _ = hidden_states.shape
1002
+
1003
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4)
1004
+
1005
+ if encoder_hidden_states is None:
1006
+ encoder_hidden_states = hidden_states
1007
+ elif attn.norm_cross:
1008
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1009
+
1010
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1011
+
1012
+ query = attn.to_q(hidden_states)
1013
+ query = attn.head_to_batch_dim(query, out_dim=4)
1014
+
1015
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
1016
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
1017
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4)
1018
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4)
1019
+
1020
+ if not attn.only_cross_attention:
1021
+ key = attn.to_k(hidden_states)
1022
+ value = attn.to_v(hidden_states)
1023
+ key = attn.head_to_batch_dim(key, out_dim=4)
1024
+ value = attn.head_to_batch_dim(value, out_dim=4)
1025
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
1026
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
1027
+ else:
1028
+ key = encoder_hidden_states_key_proj
1029
+ value = encoder_hidden_states_value_proj
1030
+
1031
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
1032
+ # TODO: add support for attn.scale when we move to Torch 2.1
1033
+ hidden_states = F.scaled_dot_product_attention(
1034
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
1035
+ )
1036
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1])
1037
+
1038
+ # linear proj
1039
+ hidden_states = attn.to_out[0](hidden_states)
1040
+ # dropout
1041
+ hidden_states = attn.to_out[1](hidden_states)
1042
+
1043
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1044
+ hidden_states = hidden_states + residual
1045
+
1046
+ return hidden_states
1047
+
1048
+
1049
+ class LoRAAttnAddedKVProcessor(nn.Module):
1050
+ r"""
1051
+ Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
1052
+ encoder.
1053
+
1054
+ Args:
1055
+ hidden_size (`int`, *optional*):
1056
+ The hidden size of the attention layer.
1057
+ cross_attention_dim (`int`, *optional*, defaults to `None`):
1058
+ The number of channels in the `encoder_hidden_states`.
1059
+ rank (`int`, defaults to 4):
1060
+ The dimension of the LoRA update matrices.
1061
+
1062
+ """
1063
+
1064
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
1065
+ super().__init__()
1066
+
1067
+ self.hidden_size = hidden_size
1068
+ self.cross_attention_dim = cross_attention_dim
1069
+ self.rank = rank
1070
+
1071
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1072
+ self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1073
+ self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1074
+ self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1075
+ self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1076
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1077
+
1078
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
1079
+ residual = hidden_states
1080
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
1081
+ batch_size, sequence_length, _ = hidden_states.shape
1082
+
1083
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1084
+
1085
+ if encoder_hidden_states is None:
1086
+ encoder_hidden_states = hidden_states
1087
+ elif attn.norm_cross:
1088
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1089
+
1090
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1091
+
1092
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
1093
+ query = attn.head_to_batch_dim(query)
1094
+
1095
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
1096
+ encoder_hidden_states
1097
+ )
1098
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
1099
+ encoder_hidden_states
1100
+ )
1101
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
1102
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
1103
+
1104
+ if not attn.only_cross_attention:
1105
+ key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
1106
+ value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
1107
+ key = attn.head_to_batch_dim(key)
1108
+ value = attn.head_to_batch_dim(value)
1109
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
1110
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
1111
+ else:
1112
+ key = encoder_hidden_states_key_proj
1113
+ value = encoder_hidden_states_value_proj
1114
+
1115
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
1116
+ hidden_states = torch.bmm(attention_probs, value)
1117
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1118
+
1119
+ # linear proj
1120
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
1121
+ # dropout
1122
+ hidden_states = attn.to_out[1](hidden_states)
1123
+
1124
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1125
+ hidden_states = hidden_states + residual
1126
+
1127
+ return hidden_states
1128
+
1129
+
1130
+ class XFormersAttnAddedKVProcessor:
1131
+ r"""
1132
+ Processor for implementing memory efficient attention using xFormers.
1133
+
1134
+ Args:
1135
+ attention_op (`Callable`, *optional*, defaults to `None`):
1136
+ The base
1137
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
1138
+ use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
1139
+ operator.
1140
+ """
1141
+
1142
+ def __init__(self, attention_op: Optional[Callable] = None):
1143
+ self.attention_op = attention_op
1144
+
1145
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
1146
+ residual = hidden_states
1147
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
1148
+ batch_size, sequence_length, _ = hidden_states.shape
1149
+
1150
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1151
+
1152
+ if encoder_hidden_states is None:
1153
+ encoder_hidden_states = hidden_states
1154
+ elif attn.norm_cross:
1155
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1156
+
1157
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1158
+
1159
+ query = attn.to_q(hidden_states)
1160
+ query = attn.head_to_batch_dim(query)
1161
+
1162
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
1163
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
1164
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
1165
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
1166
+
1167
+ if not attn.only_cross_attention:
1168
+ key = attn.to_k(hidden_states)
1169
+ value = attn.to_v(hidden_states)
1170
+ key = attn.head_to_batch_dim(key)
1171
+ value = attn.head_to_batch_dim(value)
1172
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
1173
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
1174
+ else:
1175
+ key = encoder_hidden_states_key_proj
1176
+ value = encoder_hidden_states_value_proj
1177
+
1178
+ hidden_states = xformers.ops.memory_efficient_attention(
1179
+ query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1180
+ )
1181
+ hidden_states = hidden_states.to(query.dtype)
1182
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1183
+
1184
+ # linear proj
1185
+ hidden_states = attn.to_out[0](hidden_states)
1186
+ # dropout
1187
+ hidden_states = attn.to_out[1](hidden_states)
1188
+
1189
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1190
+ hidden_states = hidden_states + residual
1191
+
1192
+ return hidden_states
1193
+
1194
+
1195
+ class XFormersAttnProcessor:
1196
+ r"""
1197
+ Processor for implementing memory efficient attention using xFormers.
1198
+
1199
+ Args:
1200
+ attention_op (`Callable`, *optional*, defaults to `None`):
1201
+ The base
1202
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
1203
+ use as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best
1204
+ operator.
1205
+ """
1206
+
1207
+ def __init__(self, attention_op: Optional[Callable] = None):
1208
+ self.attention_op = attention_op
1209
+
1210
+ def __call__(
1211
+ self,
1212
+ attn: Attention,
1213
+ hidden_states: torch.FloatTensor,
1214
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1215
+ attention_mask: Optional[torch.FloatTensor] = None,
1216
+ temb: Optional[torch.FloatTensor] = None,
1217
+ ):
1218
+ residual = hidden_states
1219
+
1220
+ if attn.spatial_norm is not None:
1221
+ hidden_states = attn.spatial_norm(hidden_states, temb)
1222
+
1223
+ input_ndim = hidden_states.ndim
1224
+
1225
+ if input_ndim == 4:
1226
+ batch_size, channel, height, width = hidden_states.shape
1227
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1228
+
1229
+ batch_size, key_tokens, _ = (
1230
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1231
+ )
1232
+
1233
+ attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
1234
+ if attention_mask is not None:
1235
+ # expand our mask's singleton query_tokens dimension:
1236
+ # [batch*heads, 1, key_tokens] ->
1237
+ # [batch*heads, query_tokens, key_tokens]
1238
+ # so that it can be added as a bias onto the attention scores that xformers computes:
1239
+ # [batch*heads, query_tokens, key_tokens]
1240
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
1241
+ _, query_tokens, _ = hidden_states.shape
1242
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
1243
+
1244
+ if attn.group_norm is not None:
1245
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1246
+
1247
+ query = attn.to_q(hidden_states)
1248
+
1249
+ if encoder_hidden_states is None:
1250
+ encoder_hidden_states = hidden_states
1251
+ elif attn.norm_cross:
1252
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1253
+
1254
+ key = attn.to_k(encoder_hidden_states)
1255
+ value = attn.to_v(encoder_hidden_states)
1256
+
1257
+ query = attn.head_to_batch_dim(query).contiguous()
1258
+ key = attn.head_to_batch_dim(key).contiguous()
1259
+ value = attn.head_to_batch_dim(value).contiguous()
1260
+
1261
+ hidden_states = xformers.ops.memory_efficient_attention(
1262
+ query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1263
+ )
1264
+ hidden_states = hidden_states.to(query.dtype)
1265
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1266
+
1267
+ # linear proj
1268
+ hidden_states = attn.to_out[0](hidden_states)
1269
+ # dropout
1270
+ hidden_states = attn.to_out[1](hidden_states)
1271
+
1272
+ if input_ndim == 4:
1273
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1274
+
1275
+ if attn.residual_connection:
1276
+ hidden_states = hidden_states + residual
1277
+
1278
+ hidden_states = hidden_states / attn.rescale_output_factor
1279
+
1280
+ return hidden_states
1281
+
1282
+
1283
+ class LoRAXFormersAttnProcessor(nn.Module):
1284
+ r"""
1285
+ Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
1286
+
1287
+ Args:
1288
+ hidden_size (`int`, *optional*):
1289
+ The hidden size of the attention layer.
1290
+ cross_attention_dim (`int`, *optional*):
1291
+ The number of channels in the `encoder_hidden_states`.
1292
+ rank (`int`, defaults to 4):
1293
+ The dimension of the LoRA update matrices.
1294
+ attention_op (`Callable`, *optional*, defaults to `None`):
1295
+ The base
1296
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
1297
+ use as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best
1298
+ operator.
1299
+ network_alpha (`int`, *optional*):
1300
+ Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
1301
+
1302
+ """
1303
+
1304
+ def __init__(
1305
+ self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None, network_alpha=None
1306
+ ):
1307
+ super().__init__()
1308
+
1309
+ self.hidden_size = hidden_size
1310
+ self.cross_attention_dim = cross_attention_dim
1311
+ self.rank = rank
1312
+ self.attention_op = attention_op
1313
+
1314
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1315
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1316
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1317
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1318
+
1319
+ def __call__(
1320
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
1321
+ ):
1322
+ residual = hidden_states
1323
+
1324
+ if attn.spatial_norm is not None:
1325
+ hidden_states = attn.spatial_norm(hidden_states, temb)
1326
+
1327
+ input_ndim = hidden_states.ndim
1328
+
1329
+ if input_ndim == 4:
1330
+ batch_size, channel, height, width = hidden_states.shape
1331
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1332
+
1333
+ batch_size, sequence_length, _ = (
1334
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1335
+ )
1336
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1337
+
1338
+ if attn.group_norm is not None:
1339
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1340
+
1341
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
1342
+ query = attn.head_to_batch_dim(query).contiguous()
1343
+
1344
+ if encoder_hidden_states is None:
1345
+ encoder_hidden_states = hidden_states
1346
+ elif attn.norm_cross:
1347
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1348
+
1349
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
1350
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
1351
+
1352
+ key = attn.head_to_batch_dim(key).contiguous()
1353
+ value = attn.head_to_batch_dim(value).contiguous()
1354
+
1355
+ hidden_states = xformers.ops.memory_efficient_attention(
1356
+ query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1357
+ )
1358
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1359
+
1360
+ # linear proj
1361
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
1362
+ # dropout
1363
+ hidden_states = attn.to_out[1](hidden_states)
1364
+
1365
+ if input_ndim == 4:
1366
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1367
+
1368
+ if attn.residual_connection:
1369
+ hidden_states = hidden_states + residual
1370
+
1371
+ hidden_states = hidden_states / attn.rescale_output_factor
1372
+
1373
+ return hidden_states
1374
+
1375
+
1376
+ class LoRAAttnProcessor2_0(nn.Module):
1377
+ r"""
1378
+ Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product
1379
+ attention.
1380
+
1381
+ Args:
1382
+ hidden_size (`int`):
1383
+ The hidden size of the attention layer.
1384
+ cross_attention_dim (`int`, *optional*):
1385
+ The number of channels in the `encoder_hidden_states`.
1386
+ rank (`int`, defaults to 4):
1387
+ The dimension of the LoRA update matrices.
1388
+ network_alpha (`int`, *optional*):
1389
+ Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
1390
+ """
1391
+
1392
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
1393
+ super().__init__()
1394
+ if not hasattr(F, "scaled_dot_product_attention"):
1395
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
1396
+
1397
+ self.hidden_size = hidden_size
1398
+ self.cross_attention_dim = cross_attention_dim
1399
+ self.rank = rank
1400
+
1401
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1402
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1403
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1404
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1405
+
1406
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
1407
+ residual = hidden_states
1408
+
1409
+ input_ndim = hidden_states.ndim
1410
+
1411
+ if input_ndim == 4:
1412
+ batch_size, channel, height, width = hidden_states.shape
1413
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1414
+
1415
+ batch_size, sequence_length, _ = (
1416
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1417
+ )
1418
+ inner_dim = hidden_states.shape[-1]
1419
+
1420
+ if attention_mask is not None:
1421
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1422
+ # scaled_dot_product_attention expects attention_mask shape to be
1423
+ # (batch, heads, source_length, target_length)
1424
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
1425
+
1426
+ if attn.group_norm is not None:
1427
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1428
+
1429
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
1430
+
1431
+ if encoder_hidden_states is None:
1432
+ encoder_hidden_states = hidden_states
1433
+ elif attn.norm_cross:
1434
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1435
+
1436
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
1437
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
1438
+
1439
+ head_dim = inner_dim // attn.heads
1440
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1441
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1442
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1443
+
1444
+ # TODO: add support for attn.scale when we move to Torch 2.1
1445
+ hidden_states = F.scaled_dot_product_attention(
1446
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
1447
+ )
1448
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
1449
+ hidden_states = hidden_states.to(query.dtype)
1450
+
1451
+ # linear proj
1452
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
1453
+ # dropout
1454
+ hidden_states = attn.to_out[1](hidden_states)
1455
+
1456
+ if input_ndim == 4:
1457
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1458
+
1459
+ if attn.residual_connection:
1460
+ hidden_states = hidden_states + residual
1461
+
1462
+ hidden_states = hidden_states / attn.rescale_output_factor
1463
+
1464
+ return hidden_states
1465
+
1466
+
1467
+ class CustomDiffusionXFormersAttnProcessor(nn.Module):
1468
+ r"""
1469
+ Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
1470
+
1471
+ Args:
1472
+ train_kv (`bool`, defaults to `True`):
1473
+ Whether to newly train the key and value matrices corresponding to the text features.
1474
+ train_q_out (`bool`, defaults to `True`):
1475
+ Whether to newly train query matrices corresponding to the latent image features.
1476
+ hidden_size (`int`, *optional*, defaults to `None`):
1477
+ The hidden size of the attention layer.
1478
+ cross_attention_dim (`int`, *optional*, defaults to `None`):
1479
+ The number of channels in the `encoder_hidden_states`.
1480
+ out_bias (`bool`, defaults to `True`):
1481
+ Whether to include the bias parameter in `train_q_out`.
1482
+ dropout (`float`, *optional*, defaults to 0.0):
1483
+ The dropout probability to use.
1484
+ attention_op (`Callable`, *optional*, defaults to `None`):
1485
+ The base
1486
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use
1487
+ as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best operator.
1488
+ """
1489
+
1490
+ def __init__(
1491
+ self,
1492
+ train_kv=True,
1493
+ train_q_out=False,
1494
+ hidden_size=None,
1495
+ cross_attention_dim=None,
1496
+ out_bias=True,
1497
+ dropout=0.0,
1498
+ attention_op: Optional[Callable] = None,
1499
+ ):
1500
+ super().__init__()
1501
+ self.train_kv = train_kv
1502
+ self.train_q_out = train_q_out
1503
+
1504
+ self.hidden_size = hidden_size
1505
+ self.cross_attention_dim = cross_attention_dim
1506
+ self.attention_op = attention_op
1507
+
1508
+ # `_custom_diffusion` id for easy serialization and loading.
1509
+ if self.train_kv:
1510
+ self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
1511
+ self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
1512
+ if self.train_q_out:
1513
+ self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
1514
+ self.to_out_custom_diffusion = nn.ModuleList([])
1515
+ self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
1516
+ self.to_out_custom_diffusion.append(nn.Dropout(dropout))
1517
+
1518
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
1519
+ batch_size, sequence_length, _ = (
1520
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1521
+ )
1522
+
1523
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1524
+
1525
+ if self.train_q_out:
1526
+ query = self.to_q_custom_diffusion(hidden_states)
1527
+ else:
1528
+ query = attn.to_q(hidden_states)
1529
+
1530
+ if encoder_hidden_states is None:
1531
+ crossattn = False
1532
+ encoder_hidden_states = hidden_states
1533
+ else:
1534
+ crossattn = True
1535
+ if attn.norm_cross:
1536
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1537
+
1538
+ if self.train_kv:
1539
+ key = self.to_k_custom_diffusion(encoder_hidden_states)
1540
+ value = self.to_v_custom_diffusion(encoder_hidden_states)
1541
+ else:
1542
+ key = attn.to_k(encoder_hidden_states)
1543
+ value = attn.to_v(encoder_hidden_states)
1544
+
1545
+ if crossattn:
1546
+ detach = torch.ones_like(key)
1547
+ detach[:, :1, :] = detach[:, :1, :] * 0.0
1548
+ key = detach * key + (1 - detach) * key.detach()
1549
+ value = detach * value + (1 - detach) * value.detach()
1550
+
1551
+ query = attn.head_to_batch_dim(query).contiguous()
1552
+ key = attn.head_to_batch_dim(key).contiguous()
1553
+ value = attn.head_to_batch_dim(value).contiguous()
1554
+
1555
+ hidden_states = xformers.ops.memory_efficient_attention(
1556
+ query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1557
+ )
1558
+ hidden_states = hidden_states.to(query.dtype)
1559
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1560
+
1561
+ if self.train_q_out:
1562
+ # linear proj
1563
+ hidden_states = self.to_out_custom_diffusion[0](hidden_states)
1564
+ # dropout
1565
+ hidden_states = self.to_out_custom_diffusion[1](hidden_states)
1566
+ else:
1567
+ # linear proj
1568
+ hidden_states = attn.to_out[0](hidden_states)
1569
+ # dropout
1570
+ hidden_states = attn.to_out[1](hidden_states)
1571
+ return hidden_states
1572
+
1573
+
1574
+ class SlicedAttnProcessor:
1575
+ r"""
1576
+ Processor for implementing sliced attention.
1577
+
1578
+ Args:
1579
+ slice_size (`int`, *optional*):
1580
+ The size of each attention slice; the batched attention is computed chunk by chunk to lower peak memory, and
1581
+ `attention_head_dim` must be a multiple of the `slice_size`.
1582
+ """
1583
+
1584
+ def __init__(self, slice_size):
1585
+ self.slice_size = slice_size
1586
+
1587
+ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
1588
+ residual = hidden_states
1589
+
1590
+ input_ndim = hidden_states.ndim
1591
+
1592
+ if input_ndim == 4:
1593
+ batch_size, channel, height, width = hidden_states.shape
1594
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1595
+
1596
+ batch_size, sequence_length, _ = (
1597
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1598
+ )
1599
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1600
+
1601
+ if attn.group_norm is not None:
1602
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1603
+
1604
+ query = attn.to_q(hidden_states)
1605
+ dim = query.shape[-1]
1606
+ query = attn.head_to_batch_dim(query)
1607
+
1608
+ if encoder_hidden_states is None:
1609
+ encoder_hidden_states = hidden_states
1610
+ elif attn.norm_cross:
1611
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1612
+
1613
+ key = attn.to_k(encoder_hidden_states)
1614
+ value = attn.to_v(encoder_hidden_states)
1615
+ key = attn.head_to_batch_dim(key)
1616
+ value = attn.head_to_batch_dim(value)
1617
+
1618
+ batch_size_attention, query_tokens, _ = query.shape
1619
+ hidden_states = torch.zeros(
1620
+ (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
1621
+ )
1622
+
1623
+ for i in range(batch_size_attention // self.slice_size):
1624
+ start_idx = i * self.slice_size
1625
+ end_idx = (i + 1) * self.slice_size
1626
+
1627
+ query_slice = query[start_idx:end_idx]
1628
+ key_slice = key[start_idx:end_idx]
1629
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
1630
+
1631
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
1632
+
1633
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
1634
+
1635
+ hidden_states[start_idx:end_idx] = attn_slice
1636
+
1637
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1638
+
1639
+ # linear proj
1640
+ hidden_states = attn.to_out[0](hidden_states)
1641
+ # dropout
1642
+ hidden_states = attn.to_out[1](hidden_states)
1643
+
1644
+ if input_ndim == 4:
1645
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1646
+
1647
+ if attn.residual_connection:
1648
+ hidden_states = hidden_states + residual
1649
+
1650
+ hidden_states = hidden_states / attn.rescale_output_factor
1651
+
1652
+ return hidden_states
1653
+
1654
+
1655
+ class SlicedAttnAddedKVProcessor:
1656
+ r"""
1657
+ Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder.
1658
+
1659
+ Args:
1660
+ slice_size (`int`, *optional*):
1661
+ The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
1662
+ `attention_head_dim` must be a multiple of the `slice_size`.
1663
+ """
1664
+
1665
+ def __init__(self, slice_size):
1666
+ self.slice_size = slice_size
1667
+
1668
+ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
1669
+ residual = hidden_states
1670
+
1671
+ if attn.spatial_norm is not None:
1672
+ hidden_states = attn.spatial_norm(hidden_states, temb)
1673
+
1674
+ hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
1675
+
1676
+ batch_size, sequence_length, _ = hidden_states.shape
1677
+
1678
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1679
+
1680
+ if encoder_hidden_states is None:
1681
+ encoder_hidden_states = hidden_states
1682
+ elif attn.norm_cross:
1683
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1684
+
1685
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1686
+
1687
+ query = attn.to_q(hidden_states)
1688
+ dim = query.shape[-1]
1689
+ query = attn.head_to_batch_dim(query)
1690
+
1691
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
1692
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
1693
+
1694
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
1695
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
1696
+
1697
+ if not attn.only_cross_attention:
1698
+ key = attn.to_k(hidden_states)
1699
+ value = attn.to_v(hidden_states)
1700
+ key = attn.head_to_batch_dim(key)
1701
+ value = attn.head_to_batch_dim(value)
1702
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
1703
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
1704
+ else:
1705
+ key = encoder_hidden_states_key_proj
1706
+ value = encoder_hidden_states_value_proj
1707
+
1708
+ batch_size_attention, query_tokens, _ = query.shape
1709
+ hidden_states = torch.zeros(
1710
+ (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
1711
+ )
1712
+
1713
+ for i in range(batch_size_attention // self.slice_size):
1714
+ start_idx = i * self.slice_size
1715
+ end_idx = (i + 1) * self.slice_size
1716
+
1717
+ query_slice = query[start_idx:end_idx]
1718
+ key_slice = key[start_idx:end_idx]
1719
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
1720
+
1721
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
1722
+
1723
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
1724
+
1725
+ hidden_states[start_idx:end_idx] = attn_slice
1726
+
1727
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1728
+
1729
+ # linear proj
1730
+ hidden_states = attn.to_out[0](hidden_states)
1731
+ # dropout
1732
+ hidden_states = attn.to_out[1](hidden_states)
1733
+
1734
+ hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1735
+ hidden_states = hidden_states + residual
1736
+
1737
+ return hidden_states
1738
+
1739
+
1740
+ AttentionProcessor = Union[
1741
+ AttnProcessor,
1742
+ AttnProcessor2_0_image_prompt,
1743
+ XFormersAttnProcessor,
1744
+ SlicedAttnProcessor,
1745
+ AttnAddedKVProcessor,
1746
+ SlicedAttnAddedKVProcessor,
1747
+ AttnAddedKVProcessor2_0,
1748
+ XFormersAttnAddedKVProcessor,
1749
+ LoRAAttnProcessor,
1750
+ LoRAXFormersAttnProcessor,
1751
+ LoRAAttnProcessor2_0,
1752
+ LoRAAttnAddedKVProcessor,
1753
+ CustomDiffusionAttnProcessor,
1754
+ CustomDiffusionXFormersAttnProcessor,
1755
+ ]
1756
+
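A hedged helper sketch: nothing in this module enforces it, but a caller might prefer the PyTorch 2.0 LoRA processor when fused scaled-dot-product attention is available and fall back to the plain one otherwise.

import torch.nn.functional as F

def pick_lora_processor(hidden_size, cross_attention_dim=None, rank=4):
    # Illustrative selection heuristic; callers are free to force either class.
    if hasattr(F, "scaled_dot_product_attention"):
        return LoRAAttnProcessor2_0(hidden_size, cross_attention_dim, rank)
    return LoRAAttnProcessor(hidden_size, cross_attention_dim, rank)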
1757
+
1758
+ class SpatialNorm(nn.Module):
1759
+ """
1760
+ Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002
1761
+ """
1762
+
1763
+ def __init__(
1764
+ self,
1765
+ f_channels,
1766
+ zq_channels,
1767
+ ):
1768
+ super().__init__()
1769
+ self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True)
1770
+ self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
1771
+ self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
1772
+
1773
+ def forward(self, f, zq):
1774
+ f_size = f.shape[-2:]
1775
+ zq = F.interpolate(zq, size=f_size, mode="nearest")
1776
+ norm_f = self.norm_layer(f)
1777
+ new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
1778
+ return new_f
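A quick shape sketch for `SpatialNorm`: the conditioning map `zq` is resized to the spatial size of `f`, then used to scale and shift the group-normalized features (all sizes below are illustrative).

import torch

norm = SpatialNorm(f_channels=64, zq_channels=4)
f = torch.randn(1, 64, 32, 32)
zq = torch.randn(1, 4, 8, 8)  # lower-resolution conditioning, upsampled inside forward()
print(norm(f, zq).shape)      # torch.Size([1, 64, 32, 32])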
models/base_vision.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ base_vision.py
17
+
18
+ Abstract class definition of a Vision Backbone (Visual Featurizer), with full annotations of class methods, utility
19
+ functions, and initialization logic.
20
+
21
+ We also define the generic TimmViTBackbone class here, providing a default interface for loading any TIMM Vision
22
+ Transformer model for feature extraction.
23
+ """
24
+ from abc import ABC, abstractmethod
25
+ from dataclasses import dataclass
26
+ from functools import partial
27
+ from typing import Any, Callable, Dict, Optional, Protocol, Tuple, Union
28
+
29
+ import timm
30
+ import torch
31
+ import torch.nn as nn
32
+ import torchvision.transforms.functional as TVF
33
+ from PIL.Image import Image
34
+ from timm.models.vision_transformer import Block, VisionTransformer
35
+ from torch.distributed.fsdp.wrap import _module_wrap_policy, _or_policy, transformer_auto_wrap_policy
36
+ from torchvision.transforms import Compose, Resize
37
+
38
+
39
+ # === Utility Functions for Monkey-Patching ===
40
+ def unpack_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]:
41
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
42
+ result = fn(*args, **kwargs)
43
+ return result[0] if (isinstance(result, tuple) or isinstance(result, list)) else result
44
+
45
+ return wrapper
46
+
47
+ def return_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]:
48
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
49
+ result = fn(*args, **kwargs)
50
+ return result
51
+
52
+ return wrapper
53
+
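The two wrappers differ only in whether a single-element tuple/list result is unwrapped, as this tiny sketch shows:

first_only = unpack_tuple(lambda: ("patches",))
keep_tuple = return_tuple(lambda: ("patches",))
print(first_only())  # patches
print(keep_tuple())  # ('patches',)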
54
+
55
+ # === Interface for an Image Transform ===
56
+ class ImageTransform(Protocol):
57
+ def __call__(self, img: Image, **kwargs: str) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: ...
58
+
59
+
60
+ # === Custom Torchvision Image Transforms ===
61
+ @dataclass
62
+ class LetterboxPad:
63
+ padding_fill_value: Tuple[int, int, int]
64
+
65
+ def __call__(self, image: Image) -> Image:
66
+ """Given a PIL.Image, pad to square by adding a symmetric border around the height/width."""
67
+ (w, h), max_wh = image.size, max(image.size)
68
+ horizontal_pad, vertical_pad = int((max_wh - w) / 2), int((max_wh - h) / 2)
69
+ padding = (horizontal_pad, vertical_pad, horizontal_pad, vertical_pad)
70
+ return TVF.pad(image, padding, fill=self.padding_fill_value, padding_mode="constant")
71
+
72
+
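A minimal sketch of the letterbox transform on a non-square image (the 300x200 size and gray fill are arbitrary):

from PIL import Image

pad = LetterboxPad(padding_fill_value=(127, 127, 127))
img = Image.new("RGB", (300, 200))
print(pad(img).size)  # (300, 300): 50 px of constant padding added above and below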
73
+ # === Abstract Base Class for arbitrary Vision Backbones ===
74
+ class VisionBackbone(nn.Module, ABC):
75
+ def __init__(self, vision_backbone_id: str, image_resize_strategy: str, default_image_size: int = 224) -> None:
76
+ super().__init__()
77
+ self.identifier: str = vision_backbone_id
78
+ self.image_resize_strategy: str = image_resize_strategy
79
+ self.default_image_size: int = default_image_size
80
+
81
+ # Instance attributes for a Vision Backbone
82
+ self.featurizer: nn.Module = None
83
+ self.image_transform: ImageTransform = None
84
+
85
+ def get_image_transform(self) -> ImageTransform:
86
+ return self.image_transform
87
+
88
+ @abstractmethod
89
+ def get_fsdp_wrapping_policy(self) -> Callable: ...
90
+
91
+ @abstractmethod
92
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
93
+ """Run a forward pass through the featurizer given a set of processed images, returning patch/grid features."""
94
+ raise NotImplementedError
95
+
96
+ @property
97
+ @abstractmethod
98
+ def default_image_resolution(self) -> Tuple[int, int, int]: ...
99
+
100
+ @property
101
+ @abstractmethod
102
+ def embed_dim(self) -> int: ...
103
+
104
+ @property
105
+ @abstractmethod
106
+ def num_patches(self) -> int: ...
107
+
108
+ @property
109
+ @abstractmethod
110
+ def half_precision_dtype(self) -> torch.dtype: ...
111
+
112
+
113
+ # === Abstract Base Class for Arbitrary TIMM Vision Transformer Backbones ===
114
+ class TimmViTBackbone(VisionBackbone, ABC):
115
+ def __init__(
116
+ self,
117
+ vision_backbone_id: str,
118
+ timm_path_or_url: str,
119
+ image_resize_strategy: str,
120
+ default_image_size: int = 224,
121
+ override_act_layer: Optional[str] = None,
122
+ ) -> None:
123
+ super().__init__(vision_backbone_id, image_resize_strategy, default_image_size=default_image_size)
124
+ self.timm_path_or_url = timm_path_or_url
125
+ self.override_act_layer = override_act_layer
126
+ self.dtype = torch.bfloat16
127
+
128
+ # Initialize Featurizer (ViT) by downloading from HF / TIMM Hub if necessary
129
+ if self.override_act_layer is None:
130
+ self.featurizer: VisionTransformer = timm.create_model(
131
+ self.timm_path_or_url, pretrained=True, num_classes=0, img_size=self.default_image_size,
132
+ )
133
+ else:
134
+ self.featurizer: VisionTransformer = timm.create_model(
135
+ self.timm_path_or_url,
136
+ pretrained=True,
137
+ num_classes=0,
138
+ img_size=self.default_image_size,
139
+ act_layer=self.override_act_layer,
140
+ )
141
+ self.featurizer.eval()
142
+
143
+ # Monkey-Patch the `forward()` function of the featurizer to ensure FSDP-compatibility
144
+ # => Note: By default set `get_intermediate_layers` to return the *SECOND-TO-LAST* layer patches!
145
+ # => TODO (siddk) Remove after resolution of https://github.com/pytorch/pytorch/issues/109385
146
+ self.featurizer.forward = unpack_tuple(
147
+ partial(self.featurizer.get_intermediate_layers, n={len(self.featurizer.blocks) - 2})
148
+ )
149
+
150
+ # Validation =>> for now, this class *only* supports TIMM Vision Transformers (but can be extended!)
151
+ assert isinstance(self.featurizer, VisionTransformer), (
152
+ "Featurizer is not a TIMM VisionTransformer; if you would like to support a new visual representation, "
153
+ "file an issue or implement the requisite logic (see `cobra/models/backbones/vision/base_vision.py`)!"
154
+ )
155
+
156
+ # Get Config =>> Note :: Override default image size to ensure correct image transform
157
+ self.data_cfg = timm.data.resolve_model_data_config(self.featurizer)
158
+ self.data_cfg["input_size"] = (3, self.default_image_size, self.default_image_size)
159
+
160
+ # Initialize Default Image Transform --> Modified by `self.image_resize_strategy`
161
+ default_image_transform = timm.data.create_transform(**self.data_cfg, is_training=False)
162
+
163
+ # Fix =>> SigLIP & IN1K default transforms resize to *larger* than `self.default_image_size` (crops image)!
164
+ if "siglip" in self.timm_path_or_url or "in1k" in self.timm_path_or_url:
165
+ assert isinstance(default_image_transform, Compose), "Unexpected `default_image_transform`!"
166
+ assert isinstance(resize_transform := default_image_transform.transforms[0], Resize)
167
+ default_image_transform = Compose(
168
+ [
169
+ Resize(self.default_image_size, interpolation=resize_transform.interpolation),
170
+ *default_image_transform.transforms[1:],
171
+ ]
172
+ )
173
+
174
+ # Switch on `image_resize_strategy`
175
+ if self.image_resize_strategy == "resize-naive":
176
+ assert isinstance(default_image_transform, Compose), "Unexpected `default_image_transform`!"
177
+ assert isinstance(resize_transform := default_image_transform.transforms[0], Resize)
178
+
179
+ target_size = (self.default_image_size, self.default_image_size)
180
+ self.image_transform = Compose(
181
+ [
182
+ Resize(target_size, interpolation=resize_transform.interpolation),
183
+ *default_image_transform.transforms[1:],
184
+ ]
185
+ )
186
+
187
+ elif self.image_resize_strategy == "resize-crop":
188
+ self.image_transform = default_image_transform
189
+
190
+ elif self.image_resize_strategy == "letterbox":
191
+ assert isinstance(default_image_transform, Compose), "Unexpected `default_image_transform`!"
192
+ assert "mean" in self.data_cfg, "TIMM `data_cfg` missing image normalization mean!"
193
+
194
+ # Compute Padding Fill Value (rescaled normalization mean if applicable)
195
+ fill = tuple([int(x * 255) for x in self.data_cfg["mean"]])
196
+
197
+ # Build New Transform
198
+ self.image_transform = Compose([LetterboxPad(fill), *default_image_transform.transforms])
199
+
200
+ else:
201
+ raise ValueError(f"Image Resize Strategy `{self.image_resize_strategy}` is not supported!")
202
+
203
+ def get_fsdp_wrapping_policy(self) -> Callable:
204
+ """Return a simple FSDP policy that wraps each ViT block and then the _entire_ featurizer."""
205
+ vit_wrap_policy = partial(_module_wrap_policy, module_classes={VisionTransformer})
206
+ transformer_block_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block})
207
+ return partial(_or_policy, policies=[vit_wrap_policy, transformer_block_policy])
208
+
209
+ def forward(self, pixel_values: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> torch.Tensor:
210
+ """Runs transformed image/pixel tensor through vision backbone, returning _all_ patch features."""
211
+ return self.featurizer(pixel_values)
212
+
213
+ @property
214
+ def default_image_resolution(self) -> Tuple[int, int, int]:
215
+ return self.data_cfg["input_size"]
216
+
217
+ @property
218
+ def embed_dim(self) -> int:
219
+ return self.featurizer.embed_dim
220
+
221
+ @property
222
+ def num_patches(self) -> int:
223
+ return self.featurizer.patch_embed.num_patches
224
+
225
+ @property
226
+ def half_precision_dtype(self) -> torch.dtype:
227
+ return self.dtype
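A hedged instantiation sketch for the TIMM-backed featurizer; the backbone id, TIMM model name, resize strategy, and dummy image below are assumptions, and the weights are fetched from the TIMM/HF hub on first use.

from PIL import Image

backbone = TimmViTBackbone(
    vision_backbone_id="siglip-vit-b16",
    timm_path_or_url="vit_base_patch16_siglip_224",
    image_resize_strategy="resize-naive",
    default_image_size=224,
)
pixels = backbone.get_image_transform()(Image.new("RGB", (640, 480)))
patches = backbone(pixels.unsqueeze(0))  # -> (1, num_patches, embed_dim)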
models/dino.py ADDED
@@ -0,0 +1,203 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ dino.py
17
+
18
+ Vision backbone that returns intermediate patch features from DINOv2 (this file has no SigLIP branch).
19
+ """
20
+ from dataclasses import dataclass
21
+ from functools import partial
22
+ from typing import Callable, Dict, Tuple
23
+ import os
24
+ import timm
25
+ import torch
26
+ from PIL import Image
27
+ from einops import rearrange
28
+ from timm.models.vision_transformer import Block, VisionTransformer
29
+ from torch.distributed.fsdp.wrap import _module_wrap_policy, _or_policy, transformer_auto_wrap_policy
30
+ from torchvision.transforms import Compose, Resize
31
+
32
+ from models.base_vision import ImageTransform, LetterboxPad, VisionBackbone, unpack_tuple, return_tuple
33
+
34
+ import torch.nn as nn
35
+ import torchvision
36
+
37
+ @dataclass
38
+ class DinoSigLIPImageTransform:
39
+ dino_image_transform: ImageTransform
40
+ siglip_image_transform: ImageTransform
41
+ is_cobra: bool = True
42
+
43
+ def __call__(self, img: Image, **kwargs: str) -> Dict[str, torch.Tensor]:
44
+ return {"dino": self.dino_image_transform(img, **kwargs).unsqueeze(0), "siglip": self.siglip_image_transform(img, **kwargs).unsqueeze(0)}
45
+
46
+
47
+ class DinoViTBackbone(VisionBackbone):
48
+ def __init__(self, backbone_name_or_path, image_resize_strategy: str, default_image_size: int = 224, last_n = 2, feature_index = 22) -> None:
49
+ super().__init__(backbone_name_or_path, image_resize_strategy, default_image_size=default_image_size)
50
+ # load from local paths
51
+ dino_pretrained_cfg = timm.models.create_model(backbone_name_or_path).default_cfg
52
+ dino_pretrained_cfg['file'] = 'ckpts/vit_large_patch14_reg4_dinov2.lvd142m/pytorch_model.bin'
53
+
54
+ # Initialize both Featurizers (ViTs) by downloading from HF / TIMM Hub if necessary
55
+ self.dino_featurizer: VisionTransformer = timm.create_model(
56
+ backbone_name_or_path, pretrained=True, num_classes=0, img_size=self.default_image_size,
57
+ pretrained_cfg=dino_pretrained_cfg
58
+ )
59
+ self.dino_featurizer.eval()
60
+
61
+ # Monkey-Patch the `forward()` function of the featurizers to ensure FSDP-compatibility
62
+ # => Note: By default set `get_intermediate_layers` to return the *SECOND-TO-LAST* layer patches!
63
+ # => TODO (siddk) Remove after resolution of https://github.com/pytorch/pytorch/issues/109385
64
+ # return the output tokens from the `n` last blocks
65
+ print("dino has {} layer intermediate features. ".format(len(self.dino_featurizer.blocks))) # 24
66
+ # self.dino_featurizer.forward = unpack_tuple(
67
+ # partial(self.dino_featurizer.get_intermediate_layers, n={len(self.dino_featurizer.blocks) - last_n})
68
+ # )
69
+ if isinstance(feature_index, tuple) or isinstance(feature_index, list):
70
+ feature_index = set(feature_index)
71
+ else:
72
+ feature_index = {feature_index}
73
+ self.dino_featurizer.forward = return_tuple(
74
+ partial(self.dino_featurizer.get_intermediate_layers, n=feature_index)
75
+ )
76
+
77
+ # Get Configs for _both_ Featurizers =>> Note :: Override default image size for larger resolution models
78
+ self.dino_data_cfg = timm.data.resolve_model_data_config(self.dino_featurizer)
79
+ self.dino_data_cfg["input_size"] = (3, self.default_image_size, self.default_image_size)
80
+
81
+ # Initialize *both* Transforms
82
+ default_dino_transform = timm.data.create_transform(**self.dino_data_cfg, is_training=False)
83
+
84
+ if self.image_resize_strategy == "resize-naive":
85
+ assert isinstance(default_dino_transform, Compose), "Unexpected `default_dino_image_transform`!"
86
+ assert isinstance(dino_resize_transform := default_dino_transform.transforms[0], Resize)
87
+
88
+ target_size = (self.default_image_size, self.default_image_size)
89
+ dino_transform = Compose(
90
+ [
91
+ Resize(target_size, interpolation=dino_resize_transform.interpolation),
92
+ *default_dino_transform.transforms[1:],
93
+ ]
94
+ )
95
+
96
+ self.dino_transform = dino_transform
97
+ else:
98
+ raise ValueError(f"Image Resize Strategy `{self.image_resize_strategy}` is not supported!")
99
+
100
+ def get_fsdp_wrapping_policy(self) -> Callable:
101
+ """Return a simple FSDP policy that wraps each ViT block and then both of the _entire_ featurizers."""
102
+ vit_wrap_policy = partial(_module_wrap_policy, module_classes={VisionTransformer})
103
+ transformer_block_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block})
104
+ return partial(_or_policy, policies=[vit_wrap_policy, transformer_block_policy])
105
+
106
+ def forward(self, pixel_values, device="cpu", input_dtype_new=None) -> torch.Tensor:
107
+ """Runs the transformed image/pixel tensors through each vision backbone, returning concatenated patches."""
108
+ # b, c , h , w : 0-1
109
+ t_tensors = []
110
+ for pixel_value in pixel_values:
111
+ t_tensors.append(self.dino_transform(pixel_value).unsqueeze(0))
112
+ t_tensors = torch.cat(t_tensors, dim=0).to(device)
113
+ if input_dtype_new is not None:
114
+ t_tensors = t_tensors.to(input_dtype_new)
115
+
116
+ t_tensors_list = self.dino_featurizer(t_tensors)
117
+ return t_tensors_list
118
+
119
+ @property
120
+ def default_image_resolution(self) -> Tuple[int, int, int]:
121
+ return self.dino_data_cfg["input_size"]
122
+
123
+ @property
124
+ def embed_dim(self) -> int:
125
+ return self.dino_featurizer.embed_dim
126
+
127
+ @property
128
+ def num_patches(self) -> int:
129
+ # no SigLIP featurizer in this backbone, so no cross-backbone patch-count check
130
+ return self.dino_featurizer.patch_embed.num_patches
131
+
132
+ @property
133
+ def half_precision_dtype(self) -> torch.dtype:
134
+ return torch.bfloat16
135
+
136
+
137
+ class DinoEncoder(nn.Module):
138
+ def __init__(self, backbone_name_or_path, image_resize_strategy: str, default_image_size: int = 224, feature_index = 22) -> None:
139
+ super().__init__()
140
+
141
+ self.image_encoder = DinoViTBackbone(backbone_name_or_path, image_resize_strategy, default_image_size, feature_index)
142
+ self.to_pil = torchvision.transforms.ToPILImage()
143
+
144
+ def forward(self, image_tensor, device="cpu", input_dtype_new=torch.float32): # input image size = 768
145
+ pixel_values = []
146
+
147
+ for image_tensor_i in image_tensor:
148
+ pixel_values.append(self.to_pil(image_tensor_i))
149
+
150
+ embeddings_dino_list = self.image_encoder(pixel_values, device, input_dtype_new)
151
+ if len(embeddings_dino_list) == 1:
152
+ embeddings_dino_list = embeddings_dino_list[0]
153
+ return embeddings_dino_list
154
+
155
+ class DinoEncoderV2(nn.Module):
156
+ def __init__(self, backbone_name_or_path, image_resize_strategy: str, default_image_size: int = 224, feature_index = 22) -> None:
157
+ super().__init__()
158
+
159
+ self.image_encoder = DinoViTBackbone(backbone_name_or_path, image_resize_strategy, default_image_size, feature_index)
160
+ self.to_pil = torchvision.transforms.ToPILImage()
161
+
162
+ def get_fsdp_wrapping_policy(self):
163
+ return self.image_encoder.get_fsdp_wrapping_policy()
164
+
165
+ def forward(self, image_tensor_dict, device="cpu", input_dtype_new=torch.float32):
166
+ image_tensor = image_tensor_dict["images_ref"]
167
+
168
+ output_dict = {}
169
+ pixel_values = []
170
+
171
+ for image_tensor_i in image_tensor:
172
+ pixel_values.append(self.to_pil(image_tensor_i))
173
+
174
+ embeddings_dino_list = self.image_encoder(pixel_values, device, input_dtype_new)
175
+ if len(embeddings_dino_list) == 1:
176
+ embeddings_dino_list = embeddings_dino_list[0]
177
+ output_dict["img_patch_features"] = embeddings_dino_list
178
+ return output_dict
179
+
180
+ class DinoEncoderV2_Canny(nn.Module):
181
+ def __init__(self, backbone_name_or_path, image_resize_strategy: str, default_image_size: int = 224, feature_index = 22) -> None:
182
+ super().__init__()
183
+
184
+ self.image_encoder = DinoViTBackbone(backbone_name_or_path, image_resize_strategy, default_image_size, feature_index)
185
+ self.to_pil = torchvision.transforms.ToPILImage()
186
+
187
+ def get_fsdp_wrapping_policy(self):
188
+ return self.image_encoder.get_fsdp_wrapping_policy()
189
+
190
+ def forward(self, image_tensor_dict, device="cpu", input_dtype_new=torch.float32):
191
+ image_canny = image_tensor_dict["images_canny"]
192
+
193
+ output_dict = {}
194
+ pixel_values = []
195
+
196
+ for image_tensor_i in image_canny:
197
+ pixel_values.append(self.to_pil(image_tensor_i))
198
+
199
+ embeddings_dino_list = self.image_encoder(pixel_values, device, input_dtype_new)
200
+ if len(embeddings_dino_list) == 1:
201
+ embeddings_dino_list = embeddings_dino_list[0]
202
+ output_dict["img_patch_features"] = embeddings_dino_list
203
+ return output_dict
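For orientation, a minimal usage sketch of the `DinoEncoder` wrapper above (assumptions: the timm id `vit_large_patch14_reg4_dinov2.lvd142m`, and the local checkpoint at ckpts/vit_large_patch14_reg4_dinov2.lvd142m/pytorch_model.bin being present, since the backbone is created with `pretrained=True` and `pretrained_cfg['file']` pointing there):

import torch
from models.dino import DinoEncoder

# Input tensors are expected in [0, 1]; they are converted back to PIL images and
# resized to `default_image_size` by the timm transform inside the backbone.
encoder = DinoEncoder(
    backbone_name_or_path="vit_large_patch14_reg4_dinov2.lvd142m",
    image_resize_strategy="resize-naive",
    default_image_size=224,
    feature_index=22,
)
images = torch.rand(2, 3, 768, 768)
with torch.no_grad():
    patches = encoder(images, device="cpu")  # one feature index -> a single (B, num_patches, embed_dim) tensor
print(patches.shape)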
models/image_encoder_siglipdino_shallowdeep.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torchvision
17
+ import torch.nn as nn
18
+ from einops import rearrange
19
+
20
+ from models.sigclip import SigLIPViTBackbone
21
+ from models.dino import DinoViTBackbone
22
+
23
+ class ShallowDeepSiglipDinoEncoder(nn.Module):
24
+ def __init__(self, siglip_config={}, dino_config={}):
25
+ super().__init__()
26
+ self.to_pil = torchvision.transforms.ToPILImage()
27
+ self.image_encoder_siglip = SigLIPViTBackbone(**siglip_config)
28
+ self.image_encoder_dino = DinoViTBackbone(**dino_config)
29
+
30
+ def forward(self, image_tensor, device="cpu"):
31
+ bs = image_tensor.size(0)
32
+ # convert tensors to PIL images
33
+ pixel_values = []
34
+ for image_tensor_i in image_tensor:
35
+ pixel_values.append(self.to_pil(image_tensor_i))
36
+
37
+ embeddings = []
38
+ embeddings_siglip_list = self.image_encoder_siglip(pixel_values, device)
39
+ embeddings_dino_list = self.image_encoder_dino(pixel_values, device)
40
+ for embeddings_siglip_i, embeddings_dino_i in zip(embeddings_siglip_list, embeddings_dino_list):
41
+ embeddings_i = torch.cat([embeddings_siglip_i, embeddings_dino_i], dim=-1) # channel concat
42
+ embeddings.append(embeddings_i)
43
+
44
+ return embeddings
45
+
46
+ # The default is to use double the image size, i.e., 768x768.
47
+ class ShallowDeepPatchfySiglipDinoEncoder(nn.Module):
48
+ def __init__(self, siglip_config={}, dino_config={}, patchfy_scale=2, default_image_size=384):
49
+ super().__init__()
50
+ self.to_pil = torchvision.transforms.ToPILImage()
51
+ self.image_encoder_siglip = SigLIPViTBackbone(**siglip_config)
52
+ self.image_encoder_dino = DinoViTBackbone(**dino_config)
53
+
54
+ self.patchfy = (patchfy_scale > 1)
55
+ self.patchfy_scale = patchfy_scale
56
+ self.default_image_size = default_image_size
57
+
58
+ def forward(self, image_tensor, device="cpu", **kwargs): # input image size = 768
59
+ image_tensor = image_tensor["image_ref"] # this is a dict
60
+ bs = image_tensor.size(0)
61
+
62
+ if self.patchfy:
63
+ image_local = rearrange(image_tensor, "b c (h hp) (w wp) -> (b hp wp) c h w", hp=self.patchfy_scale, wp=self.patchfy_scale)
64
+ image_global = torch.nn.functional.interpolate(image_tensor, size=(self.default_image_size, self.default_image_size), mode='bilinear', align_corners=True)
65
+
66
+ # convert tensors to PIL images
67
+ pixel_values_local, pixel_values_global = [], []
68
+ for image_tensor_i in image_local:
69
+ pixel_values_local.append(self.to_pil(image_tensor_i.to(torch.float)))
70
+ for image_tensor_i in image_global:
71
+ pixel_values_global.append(self.to_pil(image_tensor_i.to(torch.float)))
72
+
73
+ embeddings = []
74
+ embeddings_siglip_list = self.image_encoder_siglip(pixel_values_global, device)
75
+ embeddings_dino_list = self.image_encoder_dino(pixel_values_global, device)
76
+ for embeddings_siglip_i, embeddings_dino_i in zip(embeddings_siglip_list, embeddings_dino_list):
77
+ embeddings_i = torch.cat([embeddings_siglip_i, embeddings_dino_i], dim=-1) # channel concat
78
+ embeddings.append(embeddings_i)
79
+
80
+ embeddings_local_siglip_deep = self.image_encoder_siglip(pixel_values_local, device)[-1]
81
+ embeddings_local_dino_deep = self.image_encoder_dino(pixel_values_local, device)[-1]
82
+ embeddings_local_deep = torch.cat([embeddings_local_siglip_deep, embeddings_local_dino_deep], dim=-1)
83
+
84
+ embeddings_local_deep = rearrange(embeddings_local_deep, "(b hp wp) l c -> b (l hp wp) c", hp=self.patchfy_scale, wp=self.patchfy_scale)
85
+
86
+ embeddings.append(embeddings_local_deep)
87
+
88
+ else:
89
+ # convert tensors to PIL images
90
+ pixel_values = []
91
+ for image_tensor_i in image_tensor:
92
+ pixel_values.append(self.to_pil(image_tensor_i))
93
+
94
+ embeddings = []
95
+ embeddings_siglip_list = self.image_encoder_siglip(pixel_values, device)
96
+ embeddings_dino_list = self.image_encoder_dino(pixel_values, device)
97
+ for embeddings_siglip_i, embeddings_dino_i in zip(embeddings_siglip_list, embeddings_dino_list):
98
+ # concatenate per layer along the channel dimension
99
+ embeddings_i = torch.cat([embeddings_siglip_i, embeddings_dino_i], dim=-1) # channel concat
100
+ embeddings.append(embeddings_i)
101
+
102
+ if len(embeddings) == 1:
103
+ embeddings = embeddings[0]
104
+ return embeddings
105
+
106
+
107
+ class ShallowDeepPatchfySiglipDinoEncoder_v2(nn.Module):
108
+ def __init__(self, siglip_config={}, dino_config={}, patchfy_scale=2, default_image_size=384):
109
+ super().__init__()
110
+ self.to_pil = torchvision.transforms.ToPILImage()
111
+ self.image_encoder_siglip = SigLIPViTBackbone(**siglip_config)
112
+ self.image_encoder_dino = DinoViTBackbone(**dino_config)
113
+
114
+ self.patchfy = (patchfy_scale > 1)
115
+ self.patchfy_scale = patchfy_scale
116
+ self.default_image_size = default_image_size
117
+
118
+ def forward(self, image_tensor_dict, device="cpu", **kwargs): # input image size = 768
119
+ image_tensor = image_tensor_dict["image_ref"]
120
+ bs = image_tensor.size(0)
121
+
122
+ if self.patchfy:
123
+ image_local = rearrange(image_tensor, "b c (h hp) (w wp) -> (b hp wp) c h w", hp=self.patchfy_scale, wp=self.patchfy_scale)
124
+ image_global = torch.nn.functional.interpolate(image_tensor, size=(self.default_image_size, self.default_image_size), mode='bilinear', align_corners=True)
125
+
126
+ pixel_values_local, pixel_values_global = [], []
127
+ for image_tensor_i in image_local:
128
+ pixel_values_local.append(self.to_pil(image_tensor_i.to(torch.float32)))
129
+ for image_tensor_i in image_global:
130
+ pixel_values_global.append(self.to_pil(image_tensor_i.to(torch.float32)))
131
+
132
+ embeddings = []
133
+ embeddings_siglip_list = self.image_encoder_siglip(pixel_values_global, device)
134
+ embeddings_dino_list = self.image_encoder_dino(pixel_values_global, device)
135
+ for embeddings_siglip_i, embeddings_dino_i in zip(embeddings_siglip_list, embeddings_dino_list):
136
+ embeddings_i = torch.cat([embeddings_siglip_i, embeddings_dino_i], dim=-1) # channel concat
137
+ embeddings.append(embeddings_i)
138
+
139
+ embeddings_local_siglip_deep = self.image_encoder_siglip(pixel_values_local, device)[-1]
140
+ embeddings_local_dino_deep = self.image_encoder_dino(pixel_values_local, device)[-1]
141
+ embeddings_local_deep = torch.cat([embeddings_local_siglip_deep, embeddings_local_dino_deep], dim=-1)
142
+
143
+ embeddings_local_deep = rearrange(embeddings_local_deep, "(b hp wp) l c -> b (l hp wp) c", hp=self.patchfy_scale, wp=self.patchfy_scale)
144
+
145
+ embeddings.append(embeddings_local_deep)
146
+
147
+ else:
148
+ # convert tensors to PIL images
149
+ pixel_values = []
150
+ for image_tensor_i in image_tensor:
151
+ pixel_values.append(self.to_pil(image_tensor_i))
152
+
153
+ embeddings = []
154
+ embeddings_siglip_list = self.image_encoder_siglip(pixel_values, device)
155
+ embeddings_dino_list = self.image_encoder_dino(pixel_values, device)
156
+ for embeddings_siglip_i, embeddings_dino_i in zip(embeddings_siglip_list, embeddings_dino_list):
157
+ embeddings_i = torch.cat([embeddings_siglip_i, embeddings_dino_i], dim=-1) # channel concat
158
+ embeddings.append(embeddings_i)
159
+
160
+ if len(embeddings) == 1:
161
+ embeddings = embeddings[0]
162
+ return embeddings
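The patchfy branch above splits a high-resolution reference into `patchfy_scale**2` local views plus one resized global view. A standalone sketch of just that split (no backbones needed); note that the rearrange pattern yields pixel-interleaved sub-grids (one pixel from every 2x2 block), not contiguous quadrant crops:

import torch
import torch.nn.functional as F
from einops import rearrange

x = torch.rand(1, 3, 768, 768)   # high-resolution reference image
scale, size = 2, 384             # patchfy_scale, default_image_size

# "local" views: each output image takes one pixel from every 2x2 block of the original
local_views = rearrange(x, "b c (h hp) (w wp) -> (b hp wp) c h w", hp=scale, wp=scale)
# "global" view: plain bilinear downsample of the full image
global_view = F.interpolate(x, size=(size, size), mode="bilinear", align_corners=True)

print(local_views.shape)  # torch.Size([4, 3, 384, 384])
print(global_view.shape)  # torch.Size([1, 3, 384, 384])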
models/projectors.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+
19
+ class MinAttention(nn.Module):
20
+ def __init__(self, q_dim: int, kv_dim: int, dim_head=64, heads=8):
21
+ super().__init__()
22
+ self.dim_head = dim_head
23
+ self.heads = heads
24
+ inner_dim = dim_head * heads
25
+
26
+ self.norm1 = nn.LayerNorm(q_dim)
27
+ self.norm2 = nn.LayerNorm(kv_dim)
28
+
29
+ self.to_q = nn.Linear(q_dim, inner_dim, bias=False)
30
+ self.to_k = nn.Linear(kv_dim, inner_dim, bias=False)
31
+ self.to_v = nn.Linear(kv_dim, inner_dim, bias=False)
32
+
33
+ def forward(self, local_fea, global_fea):
34
+ global_fea = self.norm1(global_fea)
35
+ local_fea = self.norm2(local_fea)
36
+ b, l, _ = global_fea.shape
37
+
38
+ q = self.to_q(global_fea)
39
+ k = self.to_k(local_fea)
40
+ v = self.to_v(local_fea)
41
+
42
+ q = q.view(b, -1, self.heads, self.dim_head).transpose(1, 2)
43
+ k = k.view(b, -1, self.heads, self.dim_head).transpose(1, 2)
44
+ v = v.view(b, -1, self.heads, self.dim_head).transpose(1, 2)
45
+ hidden_states = F.scaled_dot_product_attention(
46
+ q,k,v, dropout_p=0.0, is_causal=False
47
+ )
48
+ hidden_states = hidden_states.transpose(1, 2).reshape(b, -1, self.heads*self.dim_head)
49
+ hidden_states = hidden_states.to(q.dtype)
50
+ return hidden_states
51
+
52
+ class CustomParameter(nn.Module):
53
+ def __init__(self, init_value):
54
+ super().__init__()
55
+ self.init_value = init_value
56
+ self.value = nn.Parameter(torch.tensor(init_value))
57
+
58
+ def forward(self):
59
+ return self.value
60
+
61
+
62
+ class ProjectorHighResMinAttn(nn.Module):
63
+ def __init__(self, vision_dim, out_dim, dim_head=64, adaptive_scale=False, scale_value=1.0, **kwargs):
64
+ super().__init__()
65
+ self.initial_projection_dim = vision_dim * 4
66
+ heads = vision_dim // dim_head
67
+
68
+ self.min_attention = MinAttention(q_dim=vision_dim, kv_dim=vision_dim, dim_head=dim_head, heads=heads)
69
+ self.projector = nn.Sequential(
70
+ nn.Linear(vision_dim, self.initial_projection_dim, bias=True),
71
+ nn.GELU(),
72
+ nn.Linear(self.initial_projection_dim, out_dim, bias=True),
73
+ nn.GELU(),
74
+ nn.Linear(out_dim, out_dim, bias=True),
75
+ nn.LayerNorm(out_dim)
76
+ )
77
+ self.projector_base = nn.Linear(vision_dim, out_dim, bias=True)
78
+
79
+ self.adaptive_scale = adaptive_scale
80
+ if self.adaptive_scale:
81
+ self.scale_value = CustomParameter(scale_value)
82
+
83
+ def forward(self, vision_input_dict, time_emb=None, **kwargs):
84
+ """
85
+ vision_input_dict: despite the name, this is not a dict but a (deep_features, deep_features_local) tuple; the name is kept for interface consistency
86
+ """
87
+ img_patch_features = vision_input_dict
88
+ deep_features, deep_features_local = img_patch_features
89
+
90
+ fused_img_features = self.min_attention(deep_features_local, deep_features)
91
+ fused_img_features = self.projector(fused_img_features)
92
+
93
+ deep_img_features = self.projector_base(deep_features)
94
+
95
+ if self.adaptive_scale:
96
+ output = deep_img_features + fused_img_features * self.scale_value()
97
+ else:
98
+ output = deep_img_features + fused_img_features
99
+ return output
100
+
101
+
102
+ class ProjectorHighResShallowMinAttnV1(nn.Module):
103
+ def __init__(self, vision_dim, out_dim, dim_head=64, **kwargs):
104
+ super().__init__()
105
+ self.initial_projection_dim = vision_dim * 4
106
+ heads = vision_dim // dim_head
107
+
108
+ self.min_attention = MinAttention(q_dim=vision_dim, kv_dim=vision_dim, dim_head=dim_head, heads=heads)
109
+ self.projector = nn.Sequential(
110
+ nn.Linear(vision_dim, self.initial_projection_dim, bias=True),
111
+ nn.GELU(),
112
+ nn.Linear(self.initial_projection_dim, out_dim, bias=True),
113
+ nn.GELU(),
114
+ nn.Linear(out_dim, out_dim, bias=True),
115
+ nn.LayerNorm(out_dim)
116
+ )
117
+ self.projector_base = nn.Linear(vision_dim, out_dim, bias=True)
118
+
119
+ self.min_attention2 = MinAttention(q_dim=vision_dim, kv_dim=vision_dim, dim_head=dim_head, heads=heads)
120
+ self.projector2 = nn.Sequential(
121
+ nn.Linear(vision_dim, self.initial_projection_dim, bias=True),
122
+ nn.GELU(),
123
+ nn.Linear(self.initial_projection_dim, out_dim, bias=True),
124
+ nn.GELU(),
125
+ nn.Linear(out_dim, out_dim, bias=True),
126
+ nn.LayerNorm(out_dim)
127
+ )
128
+
129
+ def forward(self, vision_input_dict, time_emb=None, **kwargs):
130
+ """
131
+ vision_input_dict: despite the name, this is not a dict but a tuple of (three shallow, one deep global, one deep local) feature maps; the name is kept for interface consistency
132
+ """
133
+ img_patch_features = vision_input_dict
134
+ shallow_features1, shallow_features2, shallow_features3, deep_features, deep_features_local = img_patch_features
135
+ shallow_features = torch.cat([shallow_features1, shallow_features2, shallow_features3], dim=1) # token concat
136
+
137
+ # original code
138
+ fused_img_features = self.min_attention(deep_features_local, deep_features)
139
+ fused_img_features = self.projector(fused_img_features)
140
+
141
+ deep_img_features = self.projector_base(deep_features)
142
+
143
+ output = deep_img_features + fused_img_features
144
+
145
+ # new code part
146
+ fused_img_features2 = self.min_attention2(shallow_features, deep_features)
147
+ fused_img_features2 = self.projector2(fused_img_features2)
148
+
149
+ output = torch.cat([deep_img_features, fused_img_features2], dim=1)
150
+ return output
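A quick shape check for `ProjectorHighResMinAttn`; the dimensions are illustrative assumptions (2176 = 1152 SigLIP + 1024 DINOv2 channels is one plausible `vision_dim`, 2048 matches the UNet `cross_attention_dim` in the configs, and the token counts are arbitrary):

import torch
from models.projectors import ProjectorHighResMinAttn

vision_dim, out_dim = 2176, 2048
proj = ProjectorHighResMinAttn(vision_dim=vision_dim, out_dim=out_dim, dim_head=64)

deep_global = torch.rand(2, 576, vision_dim)      # (B, tokens, C) global deep features
deep_local = torch.rand(2, 4 * 576, vision_dim)   # patchified local deep features
with torch.no_grad():
    fused = proj((deep_global, deep_local))       # queries come from the global tokens
print(fused.shape)                                # torch.Size([2, 576, 2048])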
models/sigclip.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ sigclip.py
17
+
18
+ SigLIP ViT vision backbone (adapted from a combined DINOv2 + SigLIP implementation; this backbone returns SigLIP features only).
19
+ """
20
+ from dataclasses import dataclass
21
+ from functools import partial
22
+ from typing import Callable, Dict, Tuple
23
+ import os
24
+ import timm
25
+ import torch
26
+ from PIL import Image
27
+ from timm.models.vision_transformer import Block, VisionTransformer
28
+ from torch.distributed.fsdp.wrap import _module_wrap_policy, _or_policy, transformer_auto_wrap_policy
29
+ from torchvision.transforms import Compose, Resize
30
+
31
+ from models.base_vision import ImageTransform, LetterboxPad, VisionBackbone, unpack_tuple, return_tuple
32
+
33
+ import torchvision
34
+ import torch.nn as nn
35
+
36
+ @dataclass
37
+ class DinoSigLIPImageTransform:
38
+ dino_image_transform: ImageTransform
39
+ siglip_image_transform: ImageTransform
40
+ is_cobra: bool = True
41
+
42
+ def __call__(self, img: Image, **kwargs: str) -> Dict[str, torch.Tensor]:
43
+ return {"dino": self.dino_image_transform(img, **kwargs).unsqueeze(0), "siglip": self.siglip_image_transform(img, **kwargs).unsqueeze(0)}
44
+
45
+
46
+ class SigLIPViTBackbone(VisionBackbone):
47
+ def __init__(self, backbone_name_or_path: str, image_resize_strategy: str, default_image_size: int = 224, last_n = 2, feature_index = 25) -> None:
48
+ super().__init__(backbone_name_or_path, image_resize_strategy, default_image_size=default_image_size)
49
+ # load from local paths
50
+ sigclip_pretrained_cfg = timm.models.create_model(backbone_name_or_path).default_cfg
51
+ sigclip_pretrained_cfg['file'] = 'ckpts/vit_so400m_patch14_siglip_384/open_clip_pytorch_model.bin'
52
+
53
+ # Initialize the SigLIP featurizer (ViT), downloading from the HF / TIMM Hub if necessary
54
+ self.siglip_featurizer: VisionTransformer = timm.create_model(
55
+ backbone_name_or_path, pretrained=True, num_classes=0, img_size=self.default_image_size,
56
+ pretrained_cfg=sigclip_pretrained_cfg
57
+ )
58
+ self.siglip_featurizer.eval()
59
+
60
+ # Monkey-Patch the `forward()` function of the featurizers to ensure FSDP-compatibility
61
+ # => Note: By default set `get_intermediate_layers` to return the *SECOND-TO-LAST* layer patches!
62
+ # => TODO (siddk) Remove after resolution of https://github.com/pytorch/pytorch/issues/109385
63
+ # return the output tokens from the `n` last blocks
64
+ print("siglip has {} layer intermediate features. ".format(len(self.siglip_featurizer.blocks))) # 27
65
+ # self.siglip_featurizer.forward = unpack_tuple(
66
+ # partial(self.siglip_featurizer.get_intermediate_layers, n={len(self.siglip_featurizer.blocks) - last_n})
67
+ # )
68
+ if isinstance(feature_index, tuple) or isinstance(feature_index, list):
69
+ feature_index = set(feature_index)
70
+ else:
71
+ feature_index = {feature_index}
72
+ self.siglip_featurizer.forward = return_tuple(
73
+ partial(self.siglip_featurizer.get_intermediate_layers, n=feature_index)
74
+ )
75
+
76
+ # Get the data config for the featurizer =>> Note :: Override default image size for larger resolution models
77
+
78
+ self.siglip_data_cfg = timm.data.resolve_model_data_config(self.siglip_featurizer)
79
+ self.siglip_data_cfg["input_size"] = (3, self.default_image_size, self.default_image_size)
80
+
81
+ # Initialize the image transform
82
+ default_siglip_transform = timm.data.create_transform(**self.siglip_data_cfg, is_training=False)
83
+
84
+ # Fix =>> SigLIP default transform resizes to *larger* than `self.default_image_size` (crops image)!!
85
+ assert isinstance(default_siglip_transform, Compose), "Unexpected `default_image_transform`!"
86
+ assert isinstance(sl_resize_transform := default_siglip_transform.transforms[0], Resize)
87
+ default_siglip_transform = Compose(
88
+ [
89
+ Resize(self.default_image_size, interpolation=sl_resize_transform.interpolation),
90
+ *default_siglip_transform.transforms[1:],
91
+ ]
92
+ )
93
+
94
+ if self.image_resize_strategy == "resize-naive":
95
+ assert isinstance(default_siglip_transform, Compose), "Unexpected `default_siglip_image_transform`!"
96
+ assert isinstance(siglip_resize_transform := default_siglip_transform.transforms[0], Resize)
97
+
98
+ target_size = (self.default_image_size, self.default_image_size)
99
+ siglip_transform = Compose(
100
+ [
101
+ Resize(target_size, interpolation=siglip_resize_transform.interpolation),
102
+ *default_siglip_transform.transforms[1:],
103
+ ]
104
+ )
105
+
106
+ self.siglip_transform = siglip_transform
107
+ else:
108
+ raise ValueError(f"Image Resize Strategy `{self.image_resize_strategy}` is not supported!")
109
+
110
+ def get_fsdp_wrapping_policy(self) -> Callable:
111
+ """Return a simple FSDP policy that wraps each ViT block and then both of the _entire_ featurizers."""
112
+ vit_wrap_policy = partial(_module_wrap_policy, module_classes={VisionTransformer})
113
+ transformer_block_policy = partial(transformer_auto_wrap_policy, transformer_layer_cls={Block})
114
+ return partial(_or_policy, policies=[vit_wrap_policy, transformer_block_policy])
115
+
116
+ def forward(self, pixel_values, device="cpu") -> torch.Tensor:
117
+ """Runs the transformed image/pixel tensors through each vision backbone, returning concatenated patches."""
118
+ # b, c , h , w : 0-1
119
+ t_tensors = []
120
+ for pixel_value in pixel_values:
121
+ t_tensors.append(self.siglip_transform(pixel_value).unsqueeze(0))
122
+ t_tensors = torch.cat(t_tensors, dim=0).to(device)
123
+
124
+ t_tensors_list = self.siglip_featurizer(t_tensors)
125
+ return t_tensors_list
126
+
127
+ @property
128
+ def default_image_resolution(self) -> Tuple[int, int, int]:
129
+ return self.siglip_data_cfg["input_size"]
130
+
131
+ @property
132
+ def embed_dim(self) -> int:
133
+ return self.siglip_featurizer.embed_dim
134
+
135
+ @property
136
+ def num_patches(self) -> int:
137
+ # no DINO featurizer in this backbone, so no cross-backbone patch-count check
138
+ return self.siglip_featurizer.patch_embed.num_patches
139
+
140
+ @property
141
+ def half_precision_dtype(self) -> torch.dtype:
142
+ return torch.bfloat16
143
+
144
+
145
+ class SigLIPEncoder(nn.Module):
146
+ def __init__(self, backbone_name_or_path: str, image_resize_strategy: str, default_image_size: int = 224, feature_index = 25):
147
+ super().__init__()
148
+ self.image_encoder = SigLIPViTBackbone(backbone_name_or_path, image_resize_strategy, default_image_size, feature_index=feature_index)
149
+ self.to_pil = torchvision.transforms.ToPILImage()
150
+
151
+ def forward(self, image_tensor, device="cpu"): # input image size = 768
152
+ pixel_values = []
153
+ for image_tensor_i in image_tensor:
154
+ pixel_values.append(self.to_pil(image_tensor_i))
155
+
156
+ embeddings_dino_list = self.image_encoder(pixel_values, device)
157
+ if len(embeddings_dino_list) == 1:
158
+ embeddings_dino_list = embeddings_dino_list[0]
159
+ return embeddings_dino_list
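Both backbones rely on monkey-patching `forward` onto timm's `get_intermediate_layers`; a standalone sketch of what that call returns, assuming a timm version that ships `vit_so400m_patch14_siglip_384` (here with `pretrained=False`, whereas the repo loads local weights via `pretrained_cfg['file']`):

import timm
import torch

vit = timm.create_model("vit_so400m_patch14_siglip_384", pretrained=False, num_classes=0)
x = torch.rand(1, 3, 384, 384)
with torch.no_grad():
    feats = vit.get_intermediate_layers(x, n=[25])  # one tensor per requested block index
print(len(feats), feats[0].shape)                   # 1, (B, num_patches, embed_dim)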
models/text.py ADDED
@@ -0,0 +1,113 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from dataclasses import dataclass
17
+ from torch import nn
18
+ from transformers import AutoTokenizer, CLIPTokenizerFast, CLIPTextModel, T5EncoderModel
19
+ from typing import List
20
+
21
+ @dataclass
22
+ class TextModelOutput:
23
+ embeddings: torch.Tensor
24
+ masks: torch.Tensor
25
+ pooled: List
26
+
27
+
28
+ class TextModel(nn.Module):
29
+ available_modes = [
30
+ "last", # If present, use last layer.
31
+ "penultimate", # If present, use penultimate layer.
32
+ "penultimate_nonorm", # If present, use penultimate layer without final norm.
33
+ "token_cat", # If present, concat in token dimension, default concat in channel dimension.
34
+ "pad0", # If present, use 0 padding, default use EOT padding.
35
+ "masked", # If present, pass attention mask to encoder.
36
+ ]
37
+
38
+ def __init__(self, variant: List[str], mode: List[str]):
39
+ super().__init__()
40
+ self.mode = set(mode)
41
+ self.tokenizers = []
42
+ self.models = nn.ModuleList([])
43
+
44
+ for v in variant:
45
+ if "clip" in v.lower():
46
+ self.tokenizers.append(CLIPTokenizerFast.from_pretrained(v, model_max_length=77))
47
+ self.models.append(CLIPTextModel.from_pretrained(v))
48
+ elif "t5" in v.lower() or "ul2" in v.lower():
49
+ self.tokenizers.append(AutoTokenizer.from_pretrained(v, model_max_length=77))
50
+ self.models.append(T5EncoderModel.from_pretrained(v, torch_dtype=torch.bfloat16))
51
+ else:
52
+ raise NotImplementedError
53
+
54
+ def get_vaild_token_length(self, text): # Return the length of the BPE encoding of the text, excluding `<sos>` and `<eos>`.
55
+ lengths = []
56
+ for tokenizer, model in zip(self.tokenizers, self.models):
57
+
58
+ tokens = tokenizer(
59
+ text=text,
60
+ truncation=True,
61
+ padding="max_length",
62
+ return_tensors="pt"
63
+ ).to(model.device)
64
+ token_length = tokens["attention_mask"].sum() - 2 # In the attention mask, both the SOS and EOS (first PAD) have a value of 1.
65
+ lengths.append(token_length.item())
66
+ length = int(sum(lengths) / len(lengths))
67
+ return length
68
+
69
+ def forward(self, text: List[str]) -> TextModelOutput:
70
+ embeddings = []
71
+ masks = []
72
+ pooled = []
73
+
74
+ for tokenizer, model in zip(self.tokenizers, self.models):
75
+
76
+ tokens = tokenizer(
77
+ text=text,
78
+ truncation=True,
79
+ padding="max_length",
80
+ return_tensors="pt"
81
+ ).to(model.device)
82
+
83
+ if "pad0" in self.mode:
84
+ tokens.input_ids *= tokens.attention_mask
85
+
86
+ output = model(
87
+ input_ids=tokens.input_ids,
88
+ attention_mask=tokens.attention_mask if "masked" in self.mode else None,
89
+ output_hidden_states=True
90
+ )
91
+
92
+ if "last" in self.mode:
93
+ embeddings.append(output.last_hidden_state)
94
+ if "penultimate" in self.mode:
95
+ embeddings.append(model.text_model.final_layer_norm(output.hidden_states[-2]))
96
+ if "penultimate_nonorm" in self.mode:
97
+ embeddings.append(output.hidden_states[-2])
98
+ masks.append(tokens.attention_mask)
99
+ if hasattr(output, "pooler_output"):
100
+ pooled.append(output.pooler_output)
101
+
102
+ if "token_cat" in self.mode:
103
+ return TextModelOutput(
104
+ embeddings=torch.cat(embeddings, dim=1),
105
+ masks=torch.cat(masks, dim=1),
106
+ pooled=pooled
107
+ )
108
+ else:
109
+ return TextModelOutput(
110
+ embeddings=torch.cat(embeddings, dim=2),
111
+ masks=torch.stack(masks, dim=2).sum(2).clamp_max(1),
112
+ pooled=pooled
113
+ )
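The default output path above concatenates per-encoder hidden states along the channel dimension and merges the attention masks. Sketched with dummy tensors in place of real encoders (the 768 + 1280 = 2048 channel split mirrors an SDXL-style CLIP pairing, which is an assumption here):

import torch

emb_a = torch.rand(2, 77, 768)    # e.g. CLIP ViT-L hidden states
emb_b = torch.rand(2, 77, 1280)   # e.g. OpenCLIP bigG hidden states
mask_a = torch.ones(2, 77, dtype=torch.long)
mask_b = torch.ones(2, 77, dtype=torch.long)

embeddings = torch.cat([emb_a, emb_b], dim=2)                     # (2, 77, 2048) channel concat (default)
masks = torch.stack([mask_a, mask_b], dim=2).sum(2).clamp_max(1)  # (2, 77) merged mask
print(embeddings.shape, masks.shape)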
models/transformer_2d_custom.py ADDED
@@ -0,0 +1,388 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, Optional
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ from torch import nn
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
25
+ from diffusers.utils import BaseOutput, deprecate
26
+ # from diffusers.models.attention import BasicTransformerBlock
27
+ from models.attention_custom import BasicTransformerBlock
28
+ from diffusers.models.embeddings import PatchEmbed
29
+ from diffusers.models.modeling_utils import ModelMixin
30
+
31
+ from utils import update_dict
32
+
33
+ @dataclass
34
+ class Transformer2DModelOutput(BaseOutput):
35
+ """
36
+ The output of [`Transformer2DModel`].
37
+
38
+ Args:
39
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
40
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
41
+ distributions for the unnoised latent pixels.
42
+ """
43
+
44
+ sample: torch.FloatTensor
45
+
46
+ # Transformer2DModel
47
+ class Transformer2DModel(ModelMixin, ConfigMixin):
48
+ """
49
+ A 2D Transformer model for image-like data.
50
+
51
+ Parameters:
52
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
53
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
54
+ in_channels (`int`, *optional*):
55
+ The number of channels in the input and output (specify if the input is **continuous**).
56
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
57
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
58
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
59
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
60
+ This is fixed during training since it is used to learn a number of position embeddings.
61
+ num_vector_embeds (`int`, *optional*):
62
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
63
+ Includes the class for the masked latent pixel.
64
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
65
+ num_embeds_ada_norm ( `int`, *optional*):
66
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
67
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
68
+ added to the hidden states.
69
+
70
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
71
+ attention_bias (`bool`, *optional*):
72
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
73
+ """
74
+
75
+ @register_to_config
76
+ def __init__(
77
+ self,
78
+ num_attention_heads: int = 16,
79
+ attention_head_dim: int = 88,
80
+ in_channels: Optional[int] = None,
81
+ out_channels: Optional[int] = None,
82
+ num_layers: int = 1,
83
+ dropout: float = 0.0,
84
+ norm_num_groups: int = 32,
85
+ cross_attention_dim: Optional[int] = None,
86
+ attention_bias: bool = False,
87
+ sample_size: Optional[int] = None,
88
+ num_vector_embeds: Optional[int] = None,
89
+ patch_size: Optional[int] = None,
90
+ activation_fn: str = "geglu",
91
+ num_embeds_ada_norm: Optional[int] = None,
92
+ use_linear_projection: bool = False,
93
+ only_cross_attention: bool = False,
94
+ upcast_attention: bool = False,
95
+ norm_type: str = "layer_norm",
96
+ norm_elementwise_affine: bool = True,
97
+ image_prompt_settings = {},
98
+ ):
99
+ super().__init__()
100
+ self.use_linear_projection = use_linear_projection
101
+ self.num_attention_heads = num_attention_heads
102
+ self.attention_head_dim = attention_head_dim
103
+ inner_dim = num_attention_heads * attention_head_dim
104
+
105
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
106
+ # Define whether input is continuous or discrete depending on configuration
107
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
108
+ self.is_input_vectorized = num_vector_embeds is not None
109
+ self.is_input_patches = in_channels is not None and patch_size is not None
110
+
111
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
112
+ deprecation_message = (
113
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
114
+ " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
115
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
116
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
117
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
118
+ )
119
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
120
+ norm_type = "ada_norm"
121
+
122
+ if self.is_input_continuous and self.is_input_vectorized:
123
+ raise ValueError(
124
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
125
+ " sure that either `in_channels` or `num_vector_embeds` is None."
126
+ )
127
+ elif self.is_input_vectorized and self.is_input_patches:
128
+ raise ValueError(
129
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
130
+ " sure that either `num_vector_embeds` or `num_patches` is None."
131
+ )
132
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
133
+ raise ValueError(
134
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
135
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
136
+ )
137
+
138
+ # 2. Define input layers
139
+ if self.is_input_continuous:
140
+ self.in_channels = in_channels
141
+
142
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
143
+ if use_linear_projection:
144
+ self.proj_in = nn.Linear(in_channels, inner_dim)
145
+ else:
146
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
147
+ elif self.is_input_vectorized:
148
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
149
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
150
+
151
+ self.height = sample_size
152
+ self.width = sample_size
153
+ self.num_vector_embeds = num_vector_embeds
154
+ self.num_latent_pixels = self.height * self.width
155
+
156
+ self.latent_image_embedding = ImagePositionalEmbeddings(
157
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
158
+ )
159
+ elif self.is_input_patches:
160
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
161
+
162
+ self.height = sample_size
163
+ self.width = sample_size
164
+
165
+ self.patch_size = patch_size
166
+ self.pos_embed = PatchEmbed(
167
+ height=sample_size,
168
+ width=sample_size,
169
+ patch_size=patch_size,
170
+ in_channels=in_channels,
171
+ embed_dim=inner_dim,
172
+ )
173
+
174
+ # 3. Define transformers blocks, NOTE: we change the format
175
+ self.transformer_blocks = []
176
+ for d in range(num_layers):
177
+ self.transformer_blocks.append(
178
+ BasicTransformerBlock(
179
+ inner_dim,
180
+ num_attention_heads,
181
+ attention_head_dim,
182
+ dropout=dropout,
183
+ cross_attention_dim=cross_attention_dim,
184
+ activation_fn=activation_fn,
185
+ num_embeds_ada_norm=num_embeds_ada_norm,
186
+ attention_bias=attention_bias,
187
+ only_cross_attention=only_cross_attention,
188
+ upcast_attention=upcast_attention,
189
+ norm_type=norm_type,
190
+ norm_elementwise_affine=norm_elementwise_affine,
191
+ image_prompt_settings=image_prompt_settings,
192
+ )
193
+ )
194
+ image_prompt_settings["cross_attention_id"] += 1
195
+ self.transformer_blocks = nn.ModuleList(self.transformer_blocks)
196
+
197
+ # self.transformer_blocks = nn.ModuleList(
198
+ # [
199
+ # BasicTransformerBlock(
200
+ # inner_dim,
201
+ # num_attention_heads,
202
+ # attention_head_dim,
203
+ # dropout=dropout,
204
+ # cross_attention_dim=cross_attention_dim,
205
+ # activation_fn=activation_fn,
206
+ # num_embeds_ada_norm=num_embeds_ada_norm,
207
+ # attention_bias=attention_bias,
208
+ # only_cross_attention=only_cross_attention,
209
+ # upcast_attention=upcast_attention,
210
+ # norm_type=norm_type,
211
+ # norm_elementwise_affine=norm_elementwise_affine,
212
+ # image_prompt_settings=image_prompt_settings,
213
+ # )
214
+ # for d in range(num_layers)
215
+ # ]
216
+ # )
217
+
218
+ # 4. Define output layers
219
+ self.out_channels = in_channels if out_channels is None else out_channels
220
+ if self.is_input_continuous:
221
+ # TODO: should use out_channels for continuous projections
222
+ if use_linear_projection:
223
+ self.proj_out = nn.Linear(inner_dim, in_channels)
224
+ else:
225
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
226
+ elif self.is_input_vectorized:
227
+ self.norm_out = nn.LayerNorm(inner_dim)
228
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
229
+ elif self.is_input_patches:
230
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
231
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
232
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
233
+
234
+ def forward(
235
+ self,
236
+ hidden_states: torch.Tensor,
237
+ encoder_hidden_states: Optional[torch.Tensor] = None,
238
+ timestep: Optional[torch.LongTensor] = None,
239
+ class_labels: Optional[torch.LongTensor] = None,
240
+ cross_attention_kwargs: Dict[str, Any] = None,
241
+ attention_mask: Optional[torch.Tensor] = None,
242
+ encoder_attention_mask: Optional[torch.Tensor] = None,
243
+ return_dict: bool = True,
244
+ encoder_hidden_states_vision = None,
245
+ encoder_hidden_states_control = None,
246
+ vision_guided_mask = None,
247
+ extra_dict_inputs = {},
248
+ return_self_attn_map = False,
249
+ ):
250
+ """
251
+ The [`Transformer2DModel`] forward method.
252
+
253
+ Args:
254
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
255
+ Input `hidden_states`.
256
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
257
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
258
+ self-attention.
259
+ timestep ( `torch.LongTensor`, *optional*):
260
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
261
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
262
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
263
+ `AdaLayerZeroNorm`.
264
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
265
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
266
+
267
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
268
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
269
+
270
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
271
+ above. This bias will be added to the cross-attention scores.
272
+ return_dict (`bool`, *optional*, defaults to `True`):
273
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
274
+ tuple.
275
+
276
+ Returns:
277
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
278
+ `tuple` where the first element is the sample tensor.
279
+ """
280
+ # <notice>
281
+ extra_dict_outputs = {}
282
+ height, width = hidden_states.size(-2), hidden_states.size(-1)
283
+
284
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
285
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
286
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
287
+ # expects mask of shape:
288
+ # [batch, key_tokens]
289
+ # adds singleton query_tokens dimension:
290
+ # [batch, 1, key_tokens]
291
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
292
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
293
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
294
+ if attention_mask is not None and attention_mask.ndim == 2:
295
+ # assume that mask is expressed as:
296
+ # (1 = keep, 0 = discard)
297
+ # convert mask into a bias that can be added to attention scores:
298
+ # (keep = +0, discard = -10000.0)
299
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
300
+ attention_mask = attention_mask.unsqueeze(1)
301
+
302
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
303
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
304
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
305
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
306
+
307
+ # 1. Input
308
+ if self.is_input_continuous:
309
+ batch, _, height, width = hidden_states.shape
310
+ residual = hidden_states
311
+
312
+ hidden_states = self.norm(hidden_states)
313
+ if not self.use_linear_projection:
314
+ hidden_states = self.proj_in(hidden_states)
315
+ inner_dim = hidden_states.shape[1]
316
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
317
+ else:
318
+ inner_dim = hidden_states.shape[1]
319
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
320
+ hidden_states = self.proj_in(hidden_states)
321
+ elif self.is_input_vectorized:
322
+ hidden_states = self.latent_image_embedding(hidden_states)
323
+ elif self.is_input_patches:
324
+ hidden_states = self.pos_embed(hidden_states)
325
+
326
+ # 2. Blocks
327
+ for block in self.transformer_blocks:
328
+ hidden_states, extra_dict_output_transformer = block(
329
+ hidden_states,
330
+ attention_mask=attention_mask,
331
+ encoder_hidden_states=encoder_hidden_states,
332
+ encoder_attention_mask=encoder_attention_mask,
333
+ timestep=timestep,
334
+ cross_attention_kwargs=cross_attention_kwargs,
335
+ class_labels=class_labels,
336
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
337
+ encoder_hidden_states_control=encoder_hidden_states_control,
338
+ vision_guided_mask=vision_guided_mask,
339
+ extra_dict_inputs=extra_dict_inputs,
340
+ height=height,
341
+ width=width,
342
+ return_self_attn_map=return_self_attn_map
343
+ )
344
+ extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_transformer)
345
+
346
+ # 3. Output
347
+ if self.is_input_continuous:
348
+ if not self.use_linear_projection:
349
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
350
+ hidden_states = self.proj_out(hidden_states)
351
+ else:
352
+ hidden_states = self.proj_out(hidden_states)
353
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
354
+
355
+ output = hidden_states + residual
356
+ elif self.is_input_vectorized:
357
+ hidden_states = self.norm_out(hidden_states)
358
+ logits = self.out(hidden_states)
359
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
360
+ logits = logits.permute(0, 2, 1)
361
+
362
+ # log(p(x_0))
363
+ output = F.log_softmax(logits.double(), dim=1).float()
364
+ elif self.is_input_patches:
365
+ # TODO: cleanup!
366
+ conditioning = self.transformer_blocks[0].norm1.emb(
367
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
368
+ )
369
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
370
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
371
+ hidden_states = self.proj_out_2(hidden_states)
372
+
373
+ # unpatchify
374
+ height = width = int(hidden_states.shape[1] ** 0.5)
375
+ hidden_states = hidden_states.reshape(
376
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
377
+ )
378
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
379
+ output = hidden_states.reshape(
380
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
381
+ )
382
+
383
+ if not return_dict: # return_dict=False
384
+ return output, extra_dict_outputs
385
+ # return (output,)
386
+
387
+ # return Transformer2DModelOutput(sample=output)
388
+ return Transformer2DModelOutput(sample=output)[0], extra_dict_outputs
models/unet_2d_blocks_custom.py ADDED
The diff for this file is too large to render. See raw diff
 
models/unet_2d_condition_custom.py ADDED
@@ -0,0 +1,1059 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.utils.checkpoint
22
+
23
+ from einops import rearrange
24
+
25
+ from diffusers.models.modeling_utils import ModelMixin
26
+ # from diffusers.models.unet_2d_blocks import (
27
+ from models.unet_2d_blocks_custom import (
28
+ CrossAttnDownBlock2D,
29
+ CrossAttnUpBlock2D,
30
+ DownBlock2D,
31
+ UNetMidBlock2DCrossAttn,
32
+ UNetMidBlock2DSimpleCrossAttn,
33
+ UpBlock2D,
34
+ get_down_block,
35
+ get_up_block,
36
+ )
37
+
38
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
39
+ from diffusers.loaders import UNet2DConditionLoadersMixin
40
+ from diffusers.utils import BaseOutput, logging
41
+ from diffusers.models.activations import get_activation
42
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
43
+ from diffusers.models.embeddings import (
44
+ GaussianFourierProjection,
45
+ ImageHintTimeEmbedding,
46
+ ImageProjection,
47
+ ImageTimeEmbedding,
48
+ TextImageProjection,
49
+ TextImageTimeEmbedding,
50
+ TextTimeEmbedding,
51
+ TimestepEmbedding,
52
+ Timesteps,
53
+ )
54
+
55
+ from utils import update_dict
56
+
57
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
58
+
59
+
60
+ @dataclass
61
+ class UNet2DConditionOutput(BaseOutput):
62
+ """
63
+ The output of [`UNet2DConditionModel`].
64
+
65
+ Args:
66
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
67
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
68
+ """
69
+
70
+ sample: torch.FloatTensor = None
71
+
72
+ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
73
+ r"""
74
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
75
+ shaped output.
76
+
77
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
78
+ for all models (such as downloading or saving).
79
+
80
+ Parameters:her
81
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
82
+ Height and width of input/output sample.
83
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
84
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
85
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
86
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
87
+ Whether to flip the sin to cos in the time embedding.
88
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
89
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
90
+ The tuple of downsample blocks to use.
91
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
92
+ Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
93
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
94
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
95
+ The tuple of upsample blocks to use.
96
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
97
+ Whether to include self-attention in the basic transformer blocks, see
98
+ [`~models.attention.BasicTransformerBlock`].
99
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
100
+ The tuple of output channels for each block.
101
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
102
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
103
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
104
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
105
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
106
+ If `None`, normalization and activation layers is skipped in post-processing.
107
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
108
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
109
+ The dimension of the cross attention features.
110
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
111
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
112
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
113
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
114
+ encoder_hid_dim (`int`, *optional*, defaults to None):
115
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
116
+ dimension to `cross_attention_dim`.
117
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
118
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
119
+ embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
120
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
121
+ num_attention_heads (`int`, *optional*):
122
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
123
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
124
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
125
+ class_embed_type (`str`, *optional*, defaults to `None`):
126
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
127
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
128
+ addition_embed_type (`str`, *optional*, defaults to `None`):
129
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
130
+ "text". "text" will use the `TextTimeEmbedding` layer.
131
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
132
+ Dimension for the timestep embeddings.
133
+ num_class_embeds (`int`, *optional*, defaults to `None`):
134
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
135
+ class conditioning with `class_embed_type` equal to `None`.
136
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
137
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
138
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
139
+ An optional override for the dimension of the projected time embedding.
140
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
141
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
142
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
143
+ timestep_post_act (`str`, *optional*, defaults to `None`):
144
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
145
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
146
+ The dimension of `cond_proj` layer in the timestep embedding.
147
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
148
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
149
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
150
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
151
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
152
+ embeddings with the class embeddings.
153
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
154
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
155
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
156
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
157
+ otherwise.
158
+ """
159
+
160
+ _supports_gradient_checkpointing = True
161
+
162
+ @register_to_config
163
+ def __init__(
164
+ self,
165
+ sample_size: Optional[int] = None,
166
+ in_channels: int = 4,
167
+ out_channels: int = 4,
168
+ center_input_sample: bool = False,
169
+ flip_sin_to_cos: bool = True,
170
+ freq_shift: int = 0,
171
+ down_block_types: Tuple[str] = (
172
+ "CrossAttnDownBlock2D",
173
+ "CrossAttnDownBlock2D",
174
+ "CrossAttnDownBlock2D",
175
+ "DownBlock2D",
176
+ ),
177
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
178
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
179
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
180
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
181
+ layers_per_block: Union[int, Tuple[int]] = 2,
182
+ downsample_padding: int = 1,
183
+ mid_block_scale_factor: float = 1,
184
+ act_fn: str = "silu",
185
+ norm_num_groups: Optional[int] = 32,
186
+ norm_eps: float = 1e-5,
187
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
188
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
189
+ encoder_hid_dim: Optional[int] = None,
190
+ encoder_hid_dim_type: Optional[str] = None,
191
+ attention_head_dim: Union[int, Tuple[int]] = 8,
192
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
193
+ dual_cross_attention: bool = False,
194
+ use_linear_projection: bool = False,
195
+ class_embed_type: Optional[str] = None,
196
+ addition_embed_type: Optional[str] = None,
197
+ addition_time_embed_dim: Optional[int] = None,
198
+ num_class_embeds: Optional[int] = None,
199
+ upcast_attention: bool = False,
200
+ resnet_time_scale_shift: str = "default",
201
+ resnet_skip_time_act: bool = False,
202
+ resnet_out_scale_factor: int = 1.0,
203
+ time_embedding_type: str = "positional",
204
+ time_embedding_dim: Optional[int] = None,
205
+ time_embedding_act_fn: Optional[str] = None,
206
+ timestep_post_act: Optional[str] = None,
207
+ time_cond_proj_dim: Optional[int] = None,
208
+ conv_in_kernel: int = 3,
209
+ conv_out_kernel: int = 3,
210
+ projection_class_embeddings_input_dim: Optional[int] = None,
211
+ class_embeddings_concat: bool = False,
212
+ mid_block_only_cross_attention: Optional[bool] = None,
213
+ cross_attention_norm: Optional[str] = None,
214
+ addition_embed_type_num_heads=64,
215
+
216
+ image_prompt_settings = {"dualbranch_mode": "none"},
217
+ **ignore_kwargs,
218
+ ):
219
+ super().__init__()
220
+
221
+ self.sample_size = sample_size
222
+
223
+ if num_attention_heads is not None:
224
+ raise ValueError(
225
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
226
+ )
227
+
228
+ # If `num_attention_heads` is not defined (which is the case for most models)
229
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
230
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
231
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
232
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
233
+ # which is why we correct for the naming here.
234
+ num_attention_heads = num_attention_heads or attention_head_dim
235
+
236
+ # Check inputs
237
+ if len(down_block_types) != len(up_block_types):
238
+ raise ValueError(
239
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
240
+ )
241
+
242
+ if len(block_out_channels) != len(down_block_types):
243
+ raise ValueError(
244
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
245
+ )
246
+
247
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
248
+ raise ValueError(
249
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
250
+ )
251
+
252
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
253
+ raise ValueError(
254
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
255
+ )
256
+
257
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
258
+ raise ValueError(
259
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
260
+ )
261
+
262
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
263
+ raise ValueError(
264
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
265
+ )
266
+
267
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
268
+ raise ValueError(
269
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
270
+ )
271
+
272
+ # input
273
+ conv_in_padding = (conv_in_kernel - 1) // 2
274
+ self.conv_in = nn.Conv2d(
275
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
276
+ )
277
+
278
+ # time
279
+ if time_embedding_type == "fourier":
280
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
281
+ if time_embed_dim % 2 != 0:
282
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
283
+ self.time_proj = GaussianFourierProjection(
284
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
285
+ )
286
+ timestep_input_dim = time_embed_dim
287
+ elif time_embedding_type == "positional":
288
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
289
+
290
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
291
+ timestep_input_dim = block_out_channels[0]
292
+ else:
293
+ raise ValueError(
294
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
295
+ )
296
+
297
+ self.time_embedding = TimestepEmbedding(
298
+ timestep_input_dim,
299
+ time_embed_dim,
300
+ act_fn=act_fn,
301
+ post_act_fn=timestep_post_act,
302
+ cond_proj_dim=time_cond_proj_dim,
303
+ )
304
+
305
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
306
+ encoder_hid_dim_type = "text_proj"
307
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
308
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
309
+
310
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
311
+ raise ValueError(
312
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
313
+ )
314
+
315
+ if encoder_hid_dim_type == "text_proj":
316
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
317
+ elif encoder_hid_dim_type == "text_image_proj":
318
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
319
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
320
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1).
321
+ self.encoder_hid_proj = TextImageProjection(
322
+ text_embed_dim=encoder_hid_dim,
323
+ image_embed_dim=cross_attention_dim,
324
+ cross_attention_dim=cross_attention_dim,
325
+ )
326
+ elif encoder_hid_dim_type == "image_proj":
327
+ # Kandinsky 2.2
328
+ self.encoder_hid_proj = ImageProjection(
329
+ image_embed_dim=encoder_hid_dim,
330
+ cross_attention_dim=cross_attention_dim,
331
+ )
332
+ elif encoder_hid_dim_type is not None:
333
+ raise ValueError(
334
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
335
+ )
336
+ else:
337
+ self.encoder_hid_proj = None
338
+
339
+ # class embedding
340
+ if class_embed_type is None and num_class_embeds is not None:
341
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
342
+ elif class_embed_type == "timestep":
343
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
344
+ elif class_embed_type == "identity":
345
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
346
+ elif class_embed_type == "projection":
347
+ if projection_class_embeddings_input_dim is None:
348
+ raise ValueError(
349
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
350
+ )
351
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
352
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
353
+ # 2. it projects from an arbitrary input dimension.
354
+ #
355
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
356
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
357
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
358
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
359
+ elif class_embed_type == "simple_projection":
360
+ if projection_class_embeddings_input_dim is None:
361
+ raise ValueError(
362
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
363
+ )
364
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
365
+ else:
366
+ self.class_embedding = None
367
+
368
+ if addition_embed_type == "text":
369
+ if encoder_hid_dim is not None:
370
+ text_time_embedding_from_dim = encoder_hid_dim
371
+ else:
372
+ text_time_embedding_from_dim = cross_attention_dim
373
+
374
+ self.add_embedding = TextTimeEmbedding(
375
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
376
+ )
377
+ elif addition_embed_type == "text_image":
378
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
379
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
380
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1).
381
+ self.add_embedding = TextImageTimeEmbedding(
382
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
383
+ )
384
+ elif addition_embed_type == "text_time":
385
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
386
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
387
+ elif addition_embed_type == "image":
388
+ # Kandinsky 2.2
389
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
390
+ elif addition_embed_type == "image_hint":
391
+ # Kandinsky 2.2 ControlNet
392
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
393
+ elif addition_embed_type is not None:
394
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
395
+
396
+ if time_embedding_act_fn is None:
397
+ self.time_embed_act = None
398
+ else:
399
+ self.time_embed_act = get_activation(time_embedding_act_fn)
400
+
401
+ self.down_blocks = nn.ModuleList([])
402
+ self.up_blocks = nn.ModuleList([])
403
+
404
+ if isinstance(only_cross_attention, bool):
405
+ if mid_block_only_cross_attention is None:
406
+ mid_block_only_cross_attention = only_cross_attention
407
+
408
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
409
+
410
+ if mid_block_only_cross_attention is None:
411
+ mid_block_only_cross_attention = False
412
+
413
+ if isinstance(num_attention_heads, int):
414
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
415
+
416
+ if isinstance(attention_head_dim, int):
417
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
418
+
419
+ if isinstance(cross_attention_dim, int):
420
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
421
+
422
+ if isinstance(layers_per_block, int):
423
+ layers_per_block = [layers_per_block] * len(down_block_types)
424
+
425
+ if isinstance(transformer_layers_per_block, int):
426
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
427
+
428
+ if class_embeddings_concat:
429
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
430
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
431
+ # regular time embeddings
432
+ blocks_time_embed_dim = time_embed_dim * 2
433
+ else:
434
+ blocks_time_embed_dim = time_embed_dim
435
+
436
+ # NOTE: we need to mark each cross attention id
437
+ image_prompt_settings["cross_attention_id"] = 0
438
+ # down
439
+ output_channel = block_out_channels[0]
440
+ # XL: ['DownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D']
441
+ for i, down_block_type in enumerate(down_block_types):
442
+ input_channel = output_channel
443
+ output_channel = block_out_channels[i]
444
+ is_final_block = i == len(block_out_channels) - 1
445
+
446
+ down_block = get_down_block(
447
+ down_block_type,
448
+ num_layers=layers_per_block[i],
449
+ transformer_layers_per_block=transformer_layers_per_block[i],
450
+ in_channels=input_channel,
451
+ out_channels=output_channel,
452
+ temb_channels=blocks_time_embed_dim,
453
+ add_downsample=not is_final_block,
454
+ resnet_eps=norm_eps,
455
+ resnet_act_fn=act_fn,
456
+ resnet_groups=norm_num_groups,
457
+ cross_attention_dim=cross_attention_dim[i],
458
+ num_attention_heads=num_attention_heads[i],
459
+ downsample_padding=downsample_padding,
460
+ dual_cross_attention=dual_cross_attention,
461
+ use_linear_projection=use_linear_projection,
462
+ only_cross_attention=only_cross_attention[i],
463
+ upcast_attention=upcast_attention,
464
+ resnet_time_scale_shift=resnet_time_scale_shift,
465
+ resnet_skip_time_act=resnet_skip_time_act,
466
+ resnet_out_scale_factor=resnet_out_scale_factor,
467
+ cross_attention_norm=cross_attention_norm,
468
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
469
+ image_prompt_settings=image_prompt_settings,
470
+ )
471
+ self.down_blocks.append(down_block)
472
+
473
+ # mid, XL: UNetMidBlock2DCrossAttn
474
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
475
+ self.mid_block = UNetMidBlock2DCrossAttn(
476
+ transformer_layers_per_block=transformer_layers_per_block[-1],
477
+ in_channels=block_out_channels[-1],
478
+ temb_channels=blocks_time_embed_dim,
479
+ resnet_eps=norm_eps,
480
+ resnet_act_fn=act_fn,
481
+ output_scale_factor=mid_block_scale_factor,
482
+ resnet_time_scale_shift=resnet_time_scale_shift,
483
+ cross_attention_dim=cross_attention_dim[-1],
484
+ num_attention_heads=num_attention_heads[-1],
485
+ resnet_groups=norm_num_groups,
486
+ dual_cross_attention=dual_cross_attention,
487
+ use_linear_projection=use_linear_projection,
488
+ upcast_attention=upcast_attention,
489
+ image_prompt_settings=image_prompt_settings,
490
+ )
491
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
492
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
493
+ in_channels=block_out_channels[-1],
494
+ temb_channels=blocks_time_embed_dim,
495
+ resnet_eps=norm_eps,
496
+ resnet_act_fn=act_fn,
497
+ output_scale_factor=mid_block_scale_factor,
498
+ cross_attention_dim=cross_attention_dim[-1],
499
+ attention_head_dim=attention_head_dim[-1],
500
+ resnet_groups=norm_num_groups,
501
+ resnet_time_scale_shift=resnet_time_scale_shift,
502
+ skip_time_act=resnet_skip_time_act,
503
+ only_cross_attention=mid_block_only_cross_attention,
504
+ cross_attention_norm=cross_attention_norm,
505
+ )
506
+ elif mid_block_type is None:
507
+ self.mid_block = None
508
+ else:
509
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
510
+
511
+ # count how many layers upsample the images
512
+ self.num_upsamplers = 0
513
+
514
+ # up, XL: ['CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'UpBlock2D']
515
+ reversed_block_out_channels = list(reversed(block_out_channels))
516
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
517
+ reversed_layers_per_block = list(reversed(layers_per_block))
518
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
519
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
520
+ only_cross_attention = list(reversed(only_cross_attention))
521
+
522
+ output_channel = reversed_block_out_channels[0]
523
+ for i, up_block_type in enumerate(up_block_types):
524
+ is_final_block = i == len(block_out_channels) - 1
525
+
526
+ prev_output_channel = output_channel
527
+ output_channel = reversed_block_out_channels[i]
528
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
529
+
530
+ # add upsample block for all BUT final layer
531
+ if not is_final_block:
532
+ add_upsample = True
533
+ self.num_upsamplers += 1
534
+ else:
535
+ add_upsample = False
536
+
537
+ up_block = get_up_block(
538
+ up_block_type,
539
+ num_layers=reversed_layers_per_block[i] + 1,
540
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
541
+ in_channels=input_channel,
542
+ out_channels=output_channel,
543
+ prev_output_channel=prev_output_channel,
544
+ temb_channels=blocks_time_embed_dim,
545
+ add_upsample=add_upsample,
546
+ resnet_eps=norm_eps,
547
+ resnet_act_fn=act_fn,
548
+ resnet_groups=norm_num_groups,
549
+ cross_attention_dim=reversed_cross_attention_dim[i],
550
+ num_attention_heads=reversed_num_attention_heads[i],
551
+ dual_cross_attention=dual_cross_attention,
552
+ use_linear_projection=use_linear_projection,
553
+ only_cross_attention=only_cross_attention[i],
554
+ upcast_attention=upcast_attention,
555
+ resnet_time_scale_shift=resnet_time_scale_shift,
556
+ resnet_skip_time_act=resnet_skip_time_act,
557
+ resnet_out_scale_factor=resnet_out_scale_factor,
558
+ cross_attention_norm=cross_attention_norm,
559
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
560
+ image_prompt_settings=image_prompt_settings
561
+ )
562
+ self.up_blocks.append(up_block)
563
+ prev_output_channel = output_channel
564
+
565
+ # out
566
+ if norm_num_groups is not None:
567
+ self.conv_norm_out = nn.GroupNorm(
568
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
569
+ )
570
+
571
+ self.conv_act = get_activation(act_fn)
572
+
573
+ else:
574
+ self.conv_norm_out = None
575
+ self.conv_act = None
576
+
577
+ conv_out_padding = (conv_out_kernel - 1) // 2
578
+ self.conv_out = nn.Conv2d(
579
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
580
+ )
581
+
582
+ # NOTE: settings for IP-consistent generation
583
+ from utils import instantiate_from_config
584
+ self.vision_projection_type = image_prompt_settings.get("vision_projection_type", "none")
585
+ self.cross_attention_dim = cross_attention_dim[0]
586
+ if self.vision_projection_type != "none":
587
+ self.encoder_hidden_states_vision_projection = instantiate_from_config(image_prompt_settings["vision_projection_config"])
588
+
589
+ @property
590
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
591
+ r"""
592
+ Returns:
593
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
595
+ indexed by their weight names.
595
+ """
596
+ # set recursively
597
+ processors = {}
598
+
599
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
600
+ if hasattr(module, "set_processor"):
601
+ processors[f"{name}.processor"] = module.processor
602
+
603
+ for sub_name, child in module.named_children():
604
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
605
+
606
+ return processors
607
+
608
+ for name, module in self.named_children():
609
+ fn_recursive_add_processors(name, module, processors)
610
+
611
+ return processors
612
+
613
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
614
+ r"""
615
+ Sets the attention processor to use to compute attention.
616
+
617
+ Parameters:
618
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
619
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
620
+ for **all** `Attention` layers.
621
+
622
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
623
+ processor. This is strongly recommended when setting trainable attention processors.
624
+
625
+ """
626
+ count = len(self.attn_processors.keys())
627
+
628
+ if isinstance(processor, dict) and len(processor) != count:
629
+ raise ValueError(
630
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
631
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
632
+ )
633
+
634
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
635
+ if hasattr(module, "set_processor"):
636
+ if not isinstance(processor, dict):
637
+ module.set_processor(processor)
638
+ else:
639
+ module.set_processor(processor.pop(f"{name}.processor"))
640
+
641
+ for sub_name, child in module.named_children():
642
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
643
+
644
+ for name, module in self.named_children():
645
+ fn_recursive_attn_processor(name, module, processor)
646
+
647
+ def set_default_attn_processor(self):
648
+ """
649
+ Disables custom attention processors and sets the default attention implementation.
650
+ """
651
+ self.set_attn_processor(AttnProcessor())
652
+
653
+ def set_attention_slice(self, slice_size):
654
+ r"""
655
+ Enable sliced attention computation.
656
+
657
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
658
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
659
+
660
+ Args:
661
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
662
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
663
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
664
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
665
+ must be a multiple of `slice_size`.
666
+ """
667
+ sliceable_head_dims = []
668
+
669
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
670
+ if hasattr(module, "set_attention_slice"):
671
+ sliceable_head_dims.append(module.sliceable_head_dim)
672
+
673
+ for child in module.children():
674
+ fn_recursive_retrieve_sliceable_dims(child)
675
+
676
+ # retrieve number of attention layers
677
+ for module in self.children():
678
+ fn_recursive_retrieve_sliceable_dims(module)
679
+
680
+ num_sliceable_layers = len(sliceable_head_dims)
681
+
682
+ if slice_size == "auto":
683
+ # half the attention head size is usually a good trade-off between
684
+ # speed and memory
685
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
686
+ elif slice_size == "max":
687
+ # make smallest slice possible
688
+ slice_size = num_sliceable_layers * [1]
689
+
690
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
691
+
692
+ if len(slice_size) != len(sliceable_head_dims):
693
+ raise ValueError(
694
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
695
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
696
+ )
697
+
698
+ for i in range(len(slice_size)):
699
+ size = slice_size[i]
700
+ dim = sliceable_head_dims[i]
701
+ if size is not None and size > dim:
702
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
703
+
704
+ # Recursively walk through all the children.
705
+ # Any children which exposes the set_attention_slice method
706
+ # gets the message
707
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
708
+ if hasattr(module, "set_attention_slice"):
709
+ module.set_attention_slice(slice_size.pop())
710
+
711
+ for child in module.children():
712
+ fn_recursive_set_attention_slice(child, slice_size)
713
+
714
+ reversed_slice_size = list(reversed(slice_size))
715
+ for module in self.children():
716
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
717
+
718
+ def _set_gradient_checkpointing(self, module, value=False):
719
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
720
+ module.gradient_checkpointing = value
721
+
722
+ def forward(
723
+ self,
724
+ sample: torch.FloatTensor,
725
+ timestep: Union[torch.Tensor, float, int],
726
+ encoder_hidden_states: torch.Tensor,
727
+ class_labels: Optional[torch.Tensor] = None,
728
+ timestep_cond: Optional[torch.Tensor] = None,
729
+ attention_mask: Optional[torch.Tensor] = None,
730
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
731
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
732
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
733
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
734
+ encoder_attention_mask: Optional[torch.Tensor] = None,
735
+ return_dict: bool = True,
736
+
737
+ vision_input_dict = None,
738
+ vision_guided_mask: Optional[torch.Tensor] = None,
739
+ return_text2image_mask: bool = False,
740
+ return_as_origin: bool = True,
741
+ return_self_attn_map = False,
742
+ multiple_reference_image = False,
743
+ ) -> Union[UNet2DConditionOutput, Tuple]:
744
+ r"""
745
+ The [`UNet2DConditionModel`] forward method.
746
+
747
+ Args:
748
+ sample (`torch.FloatTensor`):
749
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
750
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
751
+ encoder_hidden_states (`torch.FloatTensor`):
752
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
753
+ encoder_attention_mask (`torch.Tensor`):
754
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
755
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
756
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
757
+ return_dict (`bool`, *optional*, defaults to `True`):
758
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
759
+ tuple.
760
+ cross_attention_kwargs (`dict`, *optional*):
761
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
762
+ added_cond_kwargs: (`dict`, *optional*):
763
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
764
+ are passed along to the UNet blocks.
765
+
766
+ Returns:
767
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
768
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
769
+ a `tuple` is returned where the first element is the sample tensor.
770
+ """
771
+ extra_dict_outputs = {}
772
+ extra_dict_inputs = {}
773
+ extra_dict_inputs["multiple_reference_image"] = multiple_reference_image
774
+
775
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
776
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
777
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
778
+ # on the fly if necessary.
779
+ default_overall_up_factor = 2**self.num_upsamplers
780
+
781
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
782
+ forward_upsample_size = False
783
+ upsample_size = None
784
+
785
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
786
+ logger.info("Forward upsample size to force interpolation output size.")
787
+ forward_upsample_size = True
788
+
789
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
790
+ # expects mask of shape:
791
+ # [batch, key_tokens]
792
+ # adds singleton query_tokens dimension:
793
+ # [batch, 1, key_tokens]
794
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
795
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
796
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
797
+ if attention_mask is not None:
798
+ # assume that mask is expressed as:
799
+ # (1 = keep, 0 = discard)
800
+ # convert mask into a bias that can be added to attention scores:
801
+ # (keep = +0, discard = -10000.0)
802
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
803
+ attention_mask = attention_mask.unsqueeze(1)
804
+
805
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
806
+ if encoder_attention_mask is not None:
807
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
808
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
809
+
810
+ # 0. center input if necessary
811
+ if self.config.center_input_sample:
812
+ sample = 2 * sample - 1.0
813
+
814
+ # 1. time
815
+ timesteps = timestep
816
+ if not torch.is_tensor(timesteps):
817
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
818
+ # This would be a good case for the `match` statement (Python 3.10+)
819
+ is_mps = sample.device.type == "mps"
820
+ if isinstance(timestep, float):
821
+ dtype = torch.float32 if is_mps else torch.float64
822
+ else:
823
+ dtype = torch.int32 if is_mps else torch.int64
824
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
825
+ elif len(timesteps.shape) == 0:
826
+ timesteps = timesteps[None].to(sample.device)
827
+
828
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
829
+ timesteps = timesteps.expand(sample.shape[0])
830
+
831
+ t_emb = self.time_proj(timesteps)
832
+
833
+ # `Timesteps` does not contain any weights and will always return f32 tensors
834
+ # but time_embedding might actually be running in fp16. so we need to cast here.
835
+ # there might be better ways to encapsulate this.
836
+ t_emb = t_emb.to(dtype=sample.dtype)
837
+
838
+ emb = self.time_embedding(t_emb, timestep_cond)
839
+ aug_emb = None
840
+
841
+ if self.class_embedding is not None:
842
+ if class_labels is None:
843
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
844
+
845
+ if self.config.class_embed_type == "timestep":
846
+ class_labels = self.time_proj(class_labels)
847
+
848
+ # `Timesteps` does not contain any weights and will always return f32 tensors
849
+ # there might be better ways to encapsulate this.
850
+ class_labels = class_labels.to(dtype=sample.dtype)
851
+
852
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
853
+
854
+ if self.config.class_embeddings_concat:
855
+ emb = torch.cat([emb, class_emb], dim=-1)
856
+ else:
857
+ emb = emb + class_emb
858
+
859
+ if self.config.addition_embed_type == "text":
860
+ aug_emb = self.add_embedding(encoder_hidden_states)
861
+ elif self.config.addition_embed_type == "text_image":
862
+ # Kandinsky 2.1 - style
863
+ if "image_embeds" not in added_cond_kwargs:
864
+ raise ValueError(
865
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
866
+ )
867
+
868
+ image_embs = added_cond_kwargs.get("image_embeds")
869
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
870
+ aug_emb = self.add_embedding(text_embs, image_embs)
871
+ elif self.config.addition_embed_type == "text_time":
872
+ if "text_embeds" not in added_cond_kwargs:
873
+ raise ValueError(
874
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
875
+ )
876
+ text_embeds = added_cond_kwargs.get("text_embeds")
877
+ if "time_ids" not in added_cond_kwargs:
878
+ raise ValueError(
879
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
880
+ )
881
+ time_ids = added_cond_kwargs.get("time_ids")
882
+ time_embeds = self.add_time_proj(time_ids.flatten())
883
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
884
+
885
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
886
+ add_embeds = add_embeds.to(emb.dtype)
887
+ aug_emb = self.add_embedding(add_embeds)
888
+ elif self.config.addition_embed_type == "image":
889
+ # Kandinsky 2.2 - style
890
+ if "image_embeds" not in added_cond_kwargs:
891
+ raise ValueError(
892
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
893
+ )
894
+ image_embs = added_cond_kwargs.get("image_embeds")
895
+ aug_emb = self.add_embedding(image_embs)
896
+ elif self.config.addition_embed_type == "image_hint":
897
+ # Kandinsky 2.2 - style
898
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
899
+ raise ValueError(
900
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
901
+ )
902
+ image_embs = added_cond_kwargs.get("image_embeds")
903
+ hint = added_cond_kwargs.get("hint")
904
+ aug_emb, hint = self.add_embedding(image_embs, hint)
905
+ sample = torch.cat([sample, hint], dim=1)
906
+
907
+ emb = emb + aug_emb if aug_emb is not None else emb
908
+
909
+ if self.time_embed_act is not None:
910
+ emb = self.time_embed_act(emb)
911
+
912
+ # 2. pre-process
913
+ sample = self.conv_in(sample)
914
+
915
+ # NOTE: image condition
916
+ encoder_hidden_states_vision = None
917
+ if vision_input_dict is not None and self.vision_projection_type != "none":
918
+ if multiple_reference_image:
919
+ encoder_hidden_states_vision = []
920
+ for vision_input_dict_i in vision_input_dict:
921
+ encoder_hidden_states_vision_i = self.encoder_hidden_states_vision_projection(vision_input_dict=vision_input_dict_i, time_emb=emb, image_latent=sample)
922
+ encoder_hidden_states_vision.append(encoder_hidden_states_vision_i)
923
+ else:
924
+ encoder_hidden_states_vision = self.encoder_hidden_states_vision_projection(vision_input_dict=vision_input_dict, time_emb=emb, image_latent=sample)
925
+
926
+ if type(encoder_hidden_states_vision) == dict:
927
+ if "l_disen" in encoder_hidden_states_vision.keys():
928
+ extra_dict_outputs["l_disen"] = encoder_hidden_states_vision["l_disen"]
929
+
930
+
931
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
932
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
933
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
934
+ # Kandinsky 2.1 - style
935
+ if "image_embeds" not in added_cond_kwargs:
936
+ raise ValueError(
937
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
938
+ )
939
+
940
+ image_embeds = added_cond_kwargs.get("image_embeds")
941
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
942
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
943
+ # Kandinsky 2.2 - style
944
+ if "image_embeds" not in added_cond_kwargs:
945
+ raise ValueError(
946
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
947
+ )
948
+ image_embeds = added_cond_kwargs.get("image_embeds")
949
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
950
+
951
+ # 3. down
952
+ down_block_res_samples = (sample,)
953
+ additional_residuals = {}
954
+ for downsample_block in self.down_blocks:
955
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
956
+ sample, res_samples, extra_dict_output_down = downsample_block(
957
+ hidden_states=sample,
958
+ temb=emb,
959
+ encoder_hidden_states=encoder_hidden_states,
960
+ attention_mask=attention_mask,
961
+ cross_attention_kwargs=cross_attention_kwargs,
962
+ encoder_attention_mask=encoder_attention_mask,
963
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
964
+ encoder_hidden_states_control=None,
965
+ vision_guided_mask=vision_guided_mask,
966
+ extra_dict_inputs=extra_dict_inputs,
967
+ return_self_attn_map=return_self_attn_map,
968
+ **additional_residuals,
969
+ )
970
+ else:
971
+ sample, res_samples, extra_dict_output_down = downsample_block(hidden_states=sample, temb=emb)
972
+ extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_down)
973
+
974
+ down_block_res_samples += res_samples
975
+
976
+ if down_block_additional_residuals is not None:
977
+ new_down_block_res_samples = ()
978
+
979
+ for down_block_res_sample, down_block_additional_residual in zip(
980
+ down_block_res_samples, down_block_additional_residuals
981
+ ):
982
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
983
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
984
+
985
+ down_block_res_samples = new_down_block_res_samples
986
+
987
+ # 4. mid
988
+ if self.mid_block is not None:
989
+ sample, extra_dict_output_middle = self.mid_block(
990
+ sample,
991
+ emb,
992
+ encoder_hidden_states=encoder_hidden_states,
993
+ attention_mask=attention_mask,
994
+ cross_attention_kwargs=cross_attention_kwargs,
995
+ encoder_attention_mask=encoder_attention_mask,
996
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
997
+ encoder_hidden_states_control=None,
998
+ vision_guided_mask=vision_guided_mask,
999
+ extra_dict_inputs=extra_dict_inputs,
1000
+ return_self_attn_map=return_self_attn_map,
1001
+ )
1002
+ extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_middle)
1003
+
1004
+
1005
+ if mid_block_additional_residual is not None:
1006
+ sample = sample + mid_block_additional_residual
1007
+
1008
+ # 5. up
1009
+ for i, upsample_block in enumerate(self.up_blocks):
1010
+ is_final_block = i == len(self.up_blocks) - 1
1011
+
1012
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1013
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1014
+
1015
+ # if we have not reached the final block and need to forward the
1016
+ # upsample size, we do it here
1017
+ if not is_final_block and forward_upsample_size:
1018
+ upsample_size = down_block_res_samples[-1].shape[2:]
1019
+
1020
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1021
+ sample, extra_dict_output_up = upsample_block(
1022
+ hidden_states=sample,
1023
+ temb=emb,
1024
+ res_hidden_states_tuple=res_samples,
1025
+ encoder_hidden_states=encoder_hidden_states,
1026
+ cross_attention_kwargs=cross_attention_kwargs,
1027
+ upsample_size=upsample_size,
1028
+ attention_mask=attention_mask,
1029
+ encoder_attention_mask=encoder_attention_mask,
1030
+ encoder_hidden_states_vision=encoder_hidden_states_vision,
1031
+ encoder_hidden_states_control=None,
1032
+ vision_guided_mask=vision_guided_mask,
1033
+ extra_dict_inputs=extra_dict_inputs,
1034
+ return_self_attn_map=return_self_attn_map
1035
+ )
1036
+ else:
1037
+ sample, extra_dict_output_up = upsample_block(
1038
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
1039
+ )
1040
+ extra_dict_outputs = update_dict(extra_dict_outputs, extra_dict_output_up)
1041
+
1042
+ # 6. post-process
1043
+ if self.conv_norm_out:
1044
+ sample = self.conv_norm_out(sample)
1045
+ sample = self.conv_act(sample)
1046
+ sample = self.conv_out(sample)
1047
+
1048
+ if not return_dict:
1049
+ return (sample,)
1050
+
1051
+ if return_as_origin:
1052
+ return UNet2DConditionOutput(sample=sample)
1053
+ else:
1054
+ if return_text2image_mask:
1055
+ return UNet2DConditionOutput(sample=sample).sample, extra_dict_outputs
1056
+ else:
1057
+ return UNet2DConditionOutput(sample=sample).sample
1058
+
1059
+
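A minimal call sketch for the customized forward pass above. It is illustrative only: `unet` is assumed to be an instance of this `UNet2DConditionModel` built from one of the JSON configs in `configs/`, the tensor shapes are placeholders, and the keys of `vision_input_dict` depend on the configured vision projector (the `image_embeds` key below is a made-up name).

import torch

# Assumption: `unet` is an instance of the UNet2DConditionModel defined above,
# created from one of the configs shipped with this repository.
batch = 1
sample = torch.randn(batch, 4, 128, 128)               # noisy latents
timestep = torch.tensor([999])
encoder_hidden_states = torch.randn(batch, 77, 2048)   # text states; last dim must match cross_attention_dim
added_cond_kwargs = {                                   # required when addition_embed_type == "text_time"
    "text_embeds": torch.randn(batch, 1280),            # pooled text embedding (illustrative size)
    "time_ids": torch.randn(batch, 6),                  # SDXL-style size/crop conditioning ids
}
# Placeholder image-prompt inputs; the real keys are defined by the vision projector config.
vision_input_dict = {"image_embeds": torch.randn(batch, 257, 1024)}

noise_pred, extra_outputs = unet(
    sample,
    timestep,
    encoder_hidden_states,
    added_cond_kwargs=added_cond_kwargs,
    vision_input_dict=vision_input_dict,
    return_as_origin=False,
    return_text2image_mask=True,   # with these flags the forward returns (sample, extra_dict_outputs)
)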
models/vae.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import diffusers
17
+ import torch
18
+
19
+ class AutoencoderKL(diffusers.AutoencoderKL):
20
+ """
21
+ We simply inherit the model code from diffusers
22
+ """
23
+ def __init__(self, attention=True, *args, **kwargs):
24
+ super().__init__(*args, **kwargs)
25
+ # A hacky way to remove attention.
26
+ if not attention:
27
+ self.encoder.mid_block.attentions = torch.nn.ModuleList([None])
28
+ self.decoder.mid_block.attentions = torch.nn.ModuleList([None])
29
+
30
+ def load_state_dict(self, state_dict, strict=True):
31
+ # Newer version of diffusers changed the model keys, causing incompatibility with old checkpoints.
32
+ # They provided a method for conversion. We call conversion before loading state_dict.
33
+ convert_deprecated_attention_blocks = getattr(self, "_convert_deprecated_attention_blocks", None)
34
+ if callable(convert_deprecated_attention_blocks):
35
+ convert_deprecated_attention_blocks(state_dict)
36
+ return super().load_state_dict(state_dict, strict)
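A small, self-contained sketch of the wrapper above, useful for sanity-checking shapes. The hyper-parameters are deliberately tiny and illustrative; they are not the repository's production VAE configuration.

import torch
from models.vae import AutoencoderKL

# Tiny two-stage VAE: one downsampling step, latent resolution = input / 2.
vae = AutoencoderKL(
    in_channels=3,
    out_channels=3,
    down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
    up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
    block_out_channels=(32, 64),
    layers_per_block=1,
    latent_channels=4,
)

image = torch.randn(1, 3, 64, 64)                 # pixel-space input, roughly in [-1, 1]
latents = vae.encode(image).latent_dist.sample()  # (1, 4, 32, 32)
decoded = vae.decode(latents).sample              # (1, 3, 64, 64)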
prompts/validation_negative.txt ADDED
@@ -0,0 +1 @@
1
+ (((naked))), deformed, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, malformed hands, blurry, ((((mutated hands and fingers)))), watermark, watermarked, oversaturated, censored, distorted hands, amputation, missing hands, obese, doubled face, double hands
requirements.txt ADDED
@@ -0,0 +1,34 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu118
+ torch==2.3.1
+ torchvision==0.18.1
+ torchaudio==2.3.1
2
+
3
+ numpy
4
+ ftfy
5
+
6
+ # Training
7
+ bs4==0.0.1 # Needed for text cleaning
8
+ bson==0.5.10
9
+ diffusers==0.19.3 # diffusers[torch]==0.19.3 in control
10
+ einops==0.6.0
11
+ ftfy==6.1.1 # Needed for text cleaning
12
+ kornia==0.6.12
13
+ lpips==0.1.4
14
+ sentencepiece==0.1.99 # Needed for T5 tokenizer
15
+ transformers==4.36.2
16
+ tqdm==4.64.1
17
+ torchgeometry # Needed for ssim loss
18
+ expecttest # Needed for compile
19
+ accelerate==0.24.1 # model saving bugs when accelerate==0.25.0
20
+
21
+ # Inference
22
+ av==10.0.0
23
+ pims==0.6.1
24
+ opencv-python-headless==4.6.0.66
25
+
26
+ gradio==3.42.0
27
+ httpx==0.23.3
28
+ opencv-python
29
+ open_clip_torch
30
+ protobuf==3.20.0
31
+ huggingface_hub==0.25.0
32
+
34
+ git+https://github.com/openai/CLIP.git
schedulers/__pycache__/base.cpython-310.pyc ADDED
Binary file (3.63 kB).
schedulers/__pycache__/ddim.cpython-310.pyc ADDED
Binary file (1.98 kB).
schedulers/__pycache__/dpm_s.cpython-310.pyc ADDED
Binary file (6.09 kB).
schedulers/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.8 kB).
schedulers/base.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from dataclasses import dataclass
17
+ from abc import ABC
18
+ from typing import Optional, Union, List
19
+
20
+
21
+ @dataclass
22
+ class SchedulerConversionOutput:
23
+ pred_epsilon: torch.Tensor
24
+ pred_original_sample: torch.Tensor
25
+ pred_velocity: torch.Tensor
26
+
27
+
28
+ @dataclass
29
+ class SchedulerStepOutput:
30
+ prev_sample: torch.Tensor
31
+ pred_original_sample: Optional[torch.Tensor] = None
32
+
33
+
34
+ class Scheduler(ABC):
35
+ prediction_types = ["epsilon", "sample", "v_prediction"]
36
+ timesteps_types = ["leading", "linspace", "trailing"]
37
+
38
+ def __init__(
39
+ self,
40
+ num_train_timesteps: int,
41
+ num_inference_timesteps: int,
42
+ betas: torch.Tensor,
43
+ inference_timesteps: Union[str, List[int]] = "trailing",
44
+ set_alpha_to_one: bool = True,
45
+ device: Optional[Union[str, torch.device]] = None,
46
+ dtype: torch.dtype = torch.float32
47
+ ):
48
+ assert num_train_timesteps > 0
49
+ assert num_train_timesteps >= num_inference_timesteps
50
+ assert num_train_timesteps == betas.size(0)
51
+ assert betas.ndim == 1
52
+
53
+ self.device = device or betas.device
54
+ self.dtype = dtype
55
+
56
+ self.num_train_timesteps = num_train_timesteps
57
+ self.num_inference_timesteps = num_inference_timesteps
58
+
59
+ self.betas = betas.to(device=device, dtype=dtype)
60
+ self.alphas = 1.0 - self.betas
61
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
62
+ self.final_alpha_cumprod = torch.tensor(1.0, device=self.device, dtype=self.dtype) if set_alpha_to_one else self.alphas_cumprod[0]
63
+
64
+ if isinstance(inference_timesteps, list):
65
+ # If user defines a custom inference timestep, directly assign it.
66
+ assert len(inference_timesteps) == num_inference_timesteps
67
+ self.timesteps = torch.tensor(inference_timesteps, device=self.device, dtype=torch.int)
68
+ elif inference_timesteps == "trailing":
69
+ # Example 20 steps: [999, 949, 899, 849, 799, 749, 699, 649, 599, 549, 499, 449, 399, 349, 299, 249, 199, 149, 99, 49]
70
+ self.timesteps = torch.arange(num_train_timesteps - 1, -1, -num_train_timesteps / num_inference_timesteps, device=self.device).round().int()
71
+ elif inference_timesteps == "linspace":
72
+ # Example 20 steps: [999, 946, 894, 841, 789, 736, 684, 631, 578, 526, 473, 421, 368, 315, 263, 210, 158, 105, 53, 0]
73
+ self.timesteps = torch.linspace(0, num_train_timesteps - 1, num_inference_timesteps, device=self.device).round().int().flip(0)
74
+ elif inference_timesteps == "leading":
75
+ # Original SD and DDIM paper may have a bug: <https://github.com/huggingface/diffusers/issues/2585>
76
+ # The inference timestep does not start from 999.
77
+ # Example 20 steps: [950, 900, 850, 800, 750, 700, 650, 600, 550, 500, 450, 400, 350, 300, 250, 200, 150, 100, 50, 0]
78
+ self.timesteps = torch.arange(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps, device=self.device, dtype=torch.int).flip(0)
79
+ else:
80
+ raise NotImplementedError
81
+
82
+ def reset(self):
83
+ pass
84
+
85
+ def add_noise(
86
+ self,
87
+ original_samples: torch.Tensor,
88
+ noise: torch.Tensor,
89
+ timesteps: Union[torch.Tensor, int],
90
+ ) -> torch.Tensor:
91
+ alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (original_samples.ndim - 1)))
92
+ return alpha_prod_t ** (0.5) * original_samples + (1 - alpha_prod_t) ** (0.5) * noise
93
+
94
+ def convert_output(
95
+ self,
96
+ model_output: torch.Tensor,
97
+ model_output_type: str,
98
+ sample: torch.Tensor,
99
+ timesteps: Union[torch.Tensor, int]
100
+ ) -> SchedulerConversionOutput:
101
+ assert model_output_type in self.prediction_types
102
+
103
+ alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (sample.ndim - 1)))
104
+ beta_prod_t = 1 - alpha_prod_t
105
+
106
+ if model_output_type == "epsilon":
107
+ pred_epsilon = model_output
108
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * pred_epsilon) / alpha_prod_t ** (0.5)
109
+ pred_velocity = alpha_prod_t ** (0.5) * pred_epsilon - (1 - alpha_prod_t) ** (0.5) * pred_original_sample
110
+ elif model_output_type == "sample":
111
+ pred_original_sample = model_output
112
+ pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
113
+ pred_velocity = alpha_prod_t ** (0.5) * pred_epsilon - (1 - alpha_prod_t) ** (0.5) * pred_original_sample
114
+ elif model_output_type == "v_prediction":
115
+ pred_velocity = model_output
116
+ pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
117
+ pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
118
+ else:
119
+ raise ValueError("Unknown prediction type")
120
+
121
+ return SchedulerConversionOutput(
122
+ pred_epsilon=pred_epsilon,
123
+ pred_original_sample=pred_original_sample,
124
+ pred_velocity=pred_velocity)
125
+
126
+ def get_velocity(
127
+ self,
128
+ sample: torch.Tensor,
129
+ noise: torch.Tensor,
130
+ timesteps: torch.Tensor
131
+ ) -> torch.FloatTensor:
132
+ alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (sample.ndim - 1)))
133
+ return alpha_prod_t ** (0.5) * noise - (1 - alpha_prod_t) ** (0.5) * sample
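A self-contained worked example of the epsilon / x0 / v conversions that `add_noise`, `convert_output`, and `get_velocity` implement above. The linear beta schedule is only a toy stand-in, not the training schedule.

import torch

num_train_timesteps = 1000
betas = torch.linspace(1e-4, 2e-2, num_train_timesteps)   # illustrative schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

t = 500
a, b = alphas_cumprod[t], 1.0 - alphas_cumprod[t]

x0 = torch.randn(1, 4, 8, 8)                  # clean sample
eps = torch.randn_like(x0)                    # noise
x_t = a.sqrt() * x0 + b.sqrt() * eps          # same formula as `add_noise`
v = a.sqrt() * eps - b.sqrt() * x0            # same formula as `get_velocity`

# Recover x0 and eps from the v-prediction, exactly as `convert_output` does.
x0_from_v = a.sqrt() * x_t - b.sqrt() * v
eps_from_v = a.sqrt() * v + b.sqrt() * x_t
assert torch.allclose(x0_from_v, x0, atol=1e-5)
assert torch.allclose(eps_from_v, eps, atol=1e-5)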
schedulers/ddim.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .base import *
16
+
17
+
18
+ class DDIMScheduler(Scheduler):
19
+ def step(
20
+ self,
21
+ model_output: torch.Tensor,
22
+ model_output_type: str,
23
+ timestep: Union[torch.Tensor, int],
24
+ sample: torch.Tensor,
25
+ eta: float = 0.0,
26
+ clip_sample: bool = False,
27
+ dynamic_threshold: Optional[float] = None,
28
+ variance_noise: Optional[torch.Tensor] = None,
29
+ ) -> SchedulerStepOutput:
30
+ # 1. get previous step value (t-1)
31
+ if not isinstance(timestep, torch.Tensor):
32
+ timestep = torch.tensor(timestep, device=self.device, dtype=torch.int)
33
+
34
+ idx = timestep.reshape(-1, 1).eq(self.timesteps.reshape(1, -1)).nonzero()[:, 1]
35
+ prev_timestep = self.timesteps[idx.add(1).clamp_max(self.num_inference_timesteps - 1)]
36
+
37
+ # 2. compute alphas, betas
38
+ alpha_prod_t = self.alphas_cumprod[timestep].reshape(-1, *([1] * (sample.ndim - 1)))
39
+ alpha_prod_t_prev = torch.where(idx < self.num_inference_timesteps - 1, self.alphas_cumprod[prev_timestep], self.final_alpha_cumprod).reshape(-1, *([1] * (sample.ndim - 1)))
40
+ beta_prod_t = 1 - alpha_prod_t
41
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
42
+
43
+ # 3. compute predicted original sample from predicted noise (also called "predicted x_0" in formula (12) of https://arxiv.org/pdf/2010.02502.pdf)
44
+ model_output_conversion = self.convert_output(model_output, model_output_type, sample, timestep)
45
+ pred_original_sample = model_output_conversion.pred_original_sample
46
+ pred_epsilon = model_output_conversion.pred_epsilon
47
+
48
+ # 4. Clip or threshold "predicted x_0"
49
+ if clip_sample:
50
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
51
+ pred_epsilon = self.convert_output(pred_original_sample, "sample", sample, timestep).pred_epsilon
52
+
53
+ if dynamic_threshold is not None:
54
+ # Dynamic thresholding in https://arxiv.org/abs/2205.11487
55
+ dynamic_max_val = pred_original_sample \
56
+ .flatten(1) \
57
+ .abs() \
58
+ .float() \
59
+ .quantile(dynamic_threshold, dim=1) \
60
+ .type_as(pred_original_sample) \
61
+ .clamp_min(1) \
62
+ .view(-1, *([1] * (pred_original_sample.ndim - 1)))
63
+ pred_original_sample = pred_original_sample.clamp(-dynamic_max_val, dynamic_max_val) / dynamic_max_val
64
+ pred_epsilon = self.convert_output(pred_original_sample, "sample", sample, timestep).pred_epsilon
65
+
66
+ # 5. compute variance: "sigma_t(η)" -> see formula (16) from https://arxiv.org/pdf/2010.02502.pdf
67
+ # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
68
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
69
+ std_dev_t = eta * variance ** (0.5)
70
+
71
+ # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
72
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
73
+
74
+ # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
75
+ prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
76
+
77
+ # 8. add "random noise" if needed.
78
+ if eta > 0:
79
+ if variance_noise is None:
80
+ variance_noise = torch.randn_like(model_output)
81
+ prev_sample = prev_sample + std_dev_t * variance_noise
82
+
83
+ return SchedulerStepOutput(
84
+ prev_sample=prev_sample,
85
+ pred_original_sample=pred_original_sample)
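A minimal sampling-loop sketch for the DDIM scheduler above. The constructor keywords are assumed to match those the DPM solvers below forward to the shared base class, and `denoiser` is a hypothetical stand-in for any epsilon-prediction model:

import torch
from schedulers.ddim import DDIMScheduler
from schedulers.utils import get_betas

scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    num_inference_timesteps=50,
    betas=get_betas("squared_linear", num_steps=1000),
)

def denoiser(x, t):                   # placeholder for a real UNet call
    return torch.zeros_like(x)

latent = torch.randn(1, 4, 64, 64)    # start from pure noise
for t in scheduler.timesteps:         # descending inference timesteps
    eps = denoiser(latent, t)
    latent = scheduler.step(eps, "epsilon", t, latent).prev_sample  # eta=0: deterministic DDIM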
schedulers/dpm_m.py ADDED
@@ -0,0 +1,412 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023 TSAIL Team and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
17
+
18
+ from typing import List, Optional, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from .base import *
24
+
25
+
26
+ class DPMSolverMultistepScheduler(Scheduler):
27
+ def __init__(
28
+ self,
29
+ # Generic scheduler settings
30
+ num_inference_timesteps: int,
31
+ betas: torch.Tensor,
32
+ num_train_timesteps: int = 1000,
33
+ inference_timesteps: Union[str, List[int]] = "trailing",
34
+ set_alpha_to_one: bool = True,
35
+ device: Optional[torch.device] = None,
36
+ dtype: torch.dtype = torch.float32,
37
+ # DPM scheduler settings
38
+ solver_order: int = 2,
39
+ algorithm_type: str = "dpmsolver++",
40
+ solver_type: str = "midpoint",
41
+ lower_order_final: bool = True,
42
+ use_karras_sigmas: bool = False,
43
+ ):
44
+ super().__init__(
45
+ num_train_timesteps=num_train_timesteps,
46
+ num_inference_timesteps=num_inference_timesteps,
47
+ betas=betas,
48
+ inference_timesteps=inference_timesteps,
49
+ set_alpha_to_one=set_alpha_to_one,
50
+ device=device,
51
+ dtype=dtype,
52
+ )
53
+
54
+ self.solver_order = solver_order
55
+ self.solver_type = solver_type
56
+ self.lower_order_final = lower_order_final
57
+ self.algorithm_type = algorithm_type
58
+
59
+ # Currently we only support VP-type noise schedule
60
+ self.alpha_t = torch.sqrt(self.alphas_cumprod)
61
+ self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
62
+ self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
63
+
64
+ sigmas = torch.sqrt((1 - self.alphas_cumprod) / self.alphas_cumprod)
65
+ if use_karras_sigmas:
66
+ log_sigmas = torch.log(sigmas)
67
+ sigmas = self._convert_to_karras(
68
+ in_sigmas=sigmas, num_inference_timesteps=num_inference_timesteps)
69
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas)
70
+ for sigma in sigmas]).round()
71
+ timesteps = np.flip(timesteps).copy().astype(np.int64)
72
+ self.timesteps = torch.from_numpy(timesteps).to(device)
73
+ sigmas = torch.from_numpy(sigmas).to(device)
74
+ self.sigmas = sigmas
75
+
76
+ # standard deviation of the initial noise distribution
77
+ self.init_noise_sigma = 1.0
78
+
79
+ # settings for DPM-Solver
80
+ if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++", "deis"]:
81
+ raise NotImplementedError(
82
+ f"{algorithm_type} does is not implemented for {self.__class__}")
83
+
84
+ if solver_type not in ["midpoint", "heun", "logrho", "bh1", "bh2"]:
85
+ raise NotImplementedError(
86
+ f"{solver_type} does is not implemented for {self.__class__}")
87
+
88
+ # setable values
89
+ self.reset()
90
+
91
+ def reset(self):
92
+ self.model_outputs = [None] * self.solver_order
93
+ self.lower_order_nums = 0
94
+
95
+ def _sigma_to_t(self, sigma, log_sigmas):
96
+ # get log sigma
97
+ log_sigma = np.log(sigma)
98
+
99
+ # get distribution
100
+ dists = log_sigma - log_sigmas[:, np.newaxis]
101
+
102
+ # get sigmas range
103
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(
104
+ axis=0).clip(max=log_sigmas.shape[0] - 2)
105
+ high_idx = low_idx + 1
106
+
107
+ low = log_sigmas[low_idx]
108
+ high = log_sigmas[high_idx]
109
+
110
+ # interpolate sigmas
111
+ w = (low - log_sigma) / (low - high)
112
+ w = np.clip(w, 0, 1)
113
+
114
+ # transform interpolation to time range
115
+ t = (1 - w) * low_idx + w * high_idx
116
+ t = t.reshape(sigma.shape)
117
+ return t
118
+
119
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
120
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_timesteps) -> torch.FloatTensor:
121
+ """Constructs the noise schedule of Karras et al. (2022)."""
122
+
123
+ sigma_min: float = in_sigmas[-1].item()
124
+ sigma_max: float = in_sigmas[0].item()
125
+
126
+ rho = 7.0 # 7.0 is the value used in the paper
127
+ ramp = np.linspace(0, 1, num_inference_timesteps)
128
+ min_inv_rho = sigma_min ** (1 / rho)
129
+ max_inv_rho = sigma_max ** (1 / rho)
130
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
131
+ return sigmas
132
+
133
+ def dpm_solver_first_order_update(
134
+ self,
135
+ model_output: torch.FloatTensor,
136
+ timestep: int,
137
+ prev_timestep: int,
138
+ sample: torch.FloatTensor,
139
+ noise: Optional[torch.FloatTensor] = None,
140
+ ) -> torch.FloatTensor:
141
+ """
142
+ One step for the first-order DPM-Solver (equivalent to DDIM).
143
+
144
+ See https://arxiv.org/abs/2206.00927 for the detailed derivation.
145
+
146
+ Args:
147
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
148
+ timestep (`int`): current discrete timestep in the diffusion chain.
149
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
150
+ sample (`torch.FloatTensor`):
151
+ current instance of sample being created by diffusion process.
152
+
153
+ Returns:
154
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
155
+ """
156
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
157
+ alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
158
+ sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
159
+ h = lambda_t - lambda_s
160
+ if self.algorithm_type == "dpmsolver++":
161
+ x_t = (sigma_t / sigma_s) * sample - \
162
+ (alpha_t * (torch.exp(-h) - 1.0)) * model_output
163
+ elif self.algorithm_type == "dpmsolver":
164
+ x_t = (alpha_t / alpha_s) * sample - \
165
+ (sigma_t * (torch.exp(h) - 1.0)) * model_output
166
+ elif self.algorithm_type == "sde-dpmsolver++":
167
+ assert noise is not None
168
+ x_t = (
169
+ (sigma_t / sigma_s * torch.exp(-h)) * sample
170
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
171
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
172
+ )
173
+ elif self.algorithm_type == "sde-dpmsolver":
174
+ assert noise is not None
175
+ x_t = (
176
+ (alpha_t / alpha_s) * sample
177
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
178
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
179
+ )
180
+ return x_t
181
+
182
+ def multistep_dpm_solver_second_order_update(
183
+ self,
184
+ model_output_list: List[torch.FloatTensor],
185
+ timestep_list: List[int],
186
+ prev_timestep: int,
187
+ sample: torch.FloatTensor,
188
+ noise: Optional[torch.FloatTensor] = None,
189
+ ) -> torch.FloatTensor:
190
+ """
191
+ One step for the second-order multistep DPM-Solver.
192
+
193
+ Args:
194
+ model_output_list (`List[torch.FloatTensor]`):
195
+ direct outputs from learned diffusion model at current and latter timesteps.
196
+ timestep (`int`): current and latter discrete timestep in the diffusion chain.
197
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
198
+ sample (`torch.FloatTensor`):
199
+ current instance of sample being created by diffusion process.
200
+
201
+ Returns:
202
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
203
+ """
204
+ t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
205
+ m0, m1 = model_output_list[-1], model_output_list[-2]
206
+ lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
207
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
208
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
209
+ h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
210
+ r0 = h_0 / h
211
+ D0, D1 = m0, (1.0 / r0) * (m0 - m1)
212
+ if self.algorithm_type == "dpmsolver++":
213
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
214
+ if self.solver_type == "midpoint":
215
+ x_t = (
216
+ (sigma_t / sigma_s0) * sample
217
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
218
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
219
+ )
220
+ elif self.solver_type == "heun":
221
+ x_t = (
222
+ (sigma_t / sigma_s0) * sample
223
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
224
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
225
+ )
226
+ elif self.algorithm_type == "dpmsolver":
227
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
228
+ if self.solver_type == "midpoint":
229
+ x_t = (
230
+ (alpha_t / alpha_s0) * sample
231
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
232
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
233
+ )
234
+ elif self.solver_type == "heun":
235
+ x_t = (
236
+ (alpha_t / alpha_s0) * sample
237
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
238
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
239
+ )
240
+ elif self.algorithm_type == "sde-dpmsolver++":
241
+ assert noise is not None
242
+ if self.solver_type == "midpoint":
243
+ x_t = (
244
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
245
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
246
+ + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
247
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
248
+ )
249
+ elif self.solver_type == "heun":
250
+ x_t = (
251
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
252
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
253
+ + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
254
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
255
+ )
256
+ elif self.algorithm_type == "sde-dpmsolver":
257
+ assert noise is not None
258
+ if self.solver_type == "midpoint":
259
+ x_t = (
260
+ (alpha_t / alpha_s0) * sample
261
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
262
+ - (sigma_t * (torch.exp(h) - 1.0)) * D1
263
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
264
+ )
265
+ elif self.solver_type == "heun":
266
+ x_t = (
267
+ (alpha_t / alpha_s0) * sample
268
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
269
+ - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
270
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
271
+ )
272
+ return x_t
273
+
274
+ def multistep_dpm_solver_third_order_update(
275
+ self,
276
+ model_output_list: List[torch.FloatTensor],
277
+ timestep_list: List[int],
278
+ prev_timestep: int,
279
+ sample: torch.FloatTensor,
280
+ ) -> torch.FloatTensor:
281
+ """
282
+ One step for the third-order multistep DPM-Solver.
283
+
284
+ Args:
285
+ model_output_list (`List[torch.FloatTensor]`):
286
+ direct outputs from the learned diffusion model at the current and previous timesteps.
287
+ timestep_list (`List[int]`): the current and previous discrete timesteps in the diffusion chain.
288
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
289
+ sample (`torch.FloatTensor`):
290
+ current instance of sample being created by diffusion process.
291
+
292
+ Returns:
293
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
294
+ """
295
+ t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
296
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
297
+ lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
298
+ self.lambda_t[t],
299
+ self.lambda_t[s0],
300
+ self.lambda_t[s1],
301
+ self.lambda_t[s2],
302
+ )
303
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
304
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
305
+ h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
306
+ r0, r1 = h_0 / h, h_1 / h
307
+ D0 = m0
308
+ D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
309
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
310
+ D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
311
+ if self.algorithm_type == "dpmsolver++":
312
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
313
+ x_t = (
314
+ (sigma_t / sigma_s0) * sample
315
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
316
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
317
+ - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
318
+ )
319
+ elif self.algorithm_type == "dpmsolver":
320
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
321
+ x_t = (
322
+ (alpha_t / alpha_s0) * sample
323
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
324
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
325
+ - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
326
+ )
327
+ return x_t
328
+
329
+ def step(
330
+ self,
331
+ model_output: torch.FloatTensor,
332
+ model_output_type: str,
333
+ timestep: int,
334
+ sample: torch.FloatTensor,
335
+ ) -> SchedulerStepOutput:
336
+ """
337
+ Step function propagating the sample with the multistep DPM-Solver.
338
+
339
+ Args:
340
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
341
+ timestep (`int`): current discrete timestep in the diffusion chain.
342
+ sample (`torch.FloatTensor`):
343
+ current instance of sample being created by diffusion process.
344
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
345
+
346
+ Returns:
347
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
348
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
349
+
350
+ """
351
+ if self.num_inference_timesteps is None:
352
+ raise ValueError(
353
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
354
+ )
355
+
356
+ if isinstance(timestep, torch.Tensor):
357
+ timestep = timestep.to(self.timesteps.device)
358
+ step_index = (self.timesteps == timestep).nonzero()
359
+ if len(step_index) == 0:
360
+ step_index = len(self.timesteps) - 1
361
+ else:
362
+ step_index = step_index.item()
363
+ prev_timestep = 0 if step_index == len(
364
+ self.timesteps) - 1 else self.timesteps[step_index + 1]
365
+ lower_order_final = (
366
+ (step_index == len(self.timesteps) -
367
+ 1) and self.lower_order_final and len(self.timesteps) < 15
368
+ )
369
+ lower_order_second = (
370
+ (step_index == len(self.timesteps) -
371
+ 2) and self.lower_order_final and len(self.timesteps) < 15
372
+ )
373
+
374
+ model_output_convert = self.convert_output(
375
+ model_output, model_output_type=model_output_type, sample=sample, timesteps=timestep)
376
+ # DPM-Solver++ needs to solve an integral of the data prediction model.
377
+ if self.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
378
+ model_output = model_output_convert.pred_original_sample
379
+ # DPM-Solver needs to solve an integral of the noise prediction model.
380
+ elif self.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
381
+ model_output = model_output_convert.pred_epsilon
382
+
383
+ for i in range(self.solver_order - 1):
384
+ self.model_outputs[i] = self.model_outputs[i + 1]
385
+ self.model_outputs[-1] = model_output
386
+
387
+ if self.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
388
+ noise = torch.randn_like(
389
+ model_output, device=model_output.device, dtype=model_output.dtype)
390
+ else:
391
+ noise = None
392
+
393
+ if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
394
+ prev_sample = self.dpm_solver_first_order_update(
395
+ model_output, timestep, prev_timestep, sample, noise=noise
396
+ )
397
+ elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
398
+ timestep_list = [self.timesteps[step_index - 1], timestep]
399
+ prev_sample = self.multistep_dpm_solver_second_order_update(
400
+ self.model_outputs, timestep_list, prev_timestep, sample, noise=noise
401
+ )
402
+ else:
403
+ timestep_list = [self.timesteps[step_index - 2],
404
+ self.timesteps[step_index - 1], timestep]
405
+ prev_sample = self.multistep_dpm_solver_third_order_update(
406
+ self.model_outputs, timestep_list, prev_timestep, sample
407
+ )
408
+
409
+ if self.lower_order_nums < self.solver_order:
410
+ self.lower_order_nums += 1
411
+
412
+ return SchedulerStepOutput(prev_sample=prev_sample, pred_original_sample=model_output_convert.pred_original_sample)
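Swapping in the multistep DPM-Solver only changes the scheduler construction; the sampling loop itself is unchanged. A sketch, assuming the constructor defaults declared above and using a zero prediction as a placeholder model output. Because the solver keeps a history of recent model outputs, reset() is called between independent sampling runs:

import torch
from schedulers.dpm_m import DPMSolverMultistepScheduler
from schedulers.utils import get_betas

scheduler = DPMSolverMultistepScheduler(
    num_inference_timesteps=25,
    betas=get_betas("squared_linear", num_steps=1000),
    solver_order=2,
    algorithm_type="dpmsolver++",
)

latent = torch.randn(1, 4, 64, 64)
for t in scheduler.timesteps:
    eps = torch.zeros_like(latent)                                 # placeholder prediction
    latent = scheduler.step(eps, "epsilon", t, latent).prev_sample
scheduler.reset()                                                  # clear the multistep history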
schedulers/dpm_s.py ADDED
@@ -0,0 +1,243 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .base import *
16
+
17
+
18
+ class DPMSolverSingleStepScheduler(Scheduler):
19
+ def __init__(
20
+ self,
21
+ # Generic scheduler settings
22
+ num_train_timesteps: int,
23
+ num_inference_timesteps: int,
24
+ betas: torch.Tensor,
25
+ inference_timesteps: Union[str, List[int]] = "trailing",
26
+ set_alpha_to_one: bool = True,
27
+ device: Optional[Union[str, torch.device]] = None,
28
+ dtype: torch.dtype = torch.float32,
29
+ # DPM scheduler settings
30
+ algorithm_type: str = "dpmsolver++",
31
+ solver_type: str = "midpoint",
32
+ solver_order: int = 2,
33
+ lower_order_final: bool = True,
34
+ ):
35
+ super().__init__(
36
+ num_train_timesteps=num_train_timesteps,
37
+ num_inference_timesteps=num_inference_timesteps,
38
+ betas=betas,
39
+ inference_timesteps=inference_timesteps,
40
+ set_alpha_to_one=set_alpha_to_one,
41
+ device=device,
42
+ dtype=dtype,
43
+ )
44
+
45
+ self.solver_order = solver_order
46
+ self.solver_type = solver_type
47
+ self.lower_order_final = lower_order_final
48
+ self.algorithm_type = algorithm_type
49
+
50
+ self.alpha_t = torch.sqrt(self.alphas_cumprod)
51
+ self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
52
+ self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
53
+
54
+ self.reset()
55
+
56
+ def reset(self):
57
+ self.model_outputs = [None] * self.solver_order
58
+ self.sample = None
59
+ self.order_list = self.get_order_list()
60
+ self.last_step_index = None
61
+
62
+ def get_order_list(self) -> List[int]:
63
+ steps = self.num_inference_timesteps
64
+ order = self.solver_order
65
+ # First step must be order 1
66
+ # Second step must be order 1 in case of terminal zero SNR
67
+ orders = [1] + [(i % order) + 1 for i in range(steps - 1)] + [1]
68
+ # Last step should be order 1 for better quality.
69
+ if self.lower_order_final:
70
+ orders[-1] = 1
71
+ return orders
72
+
73
+ def dpm_solver_first_order_update(
74
+ self,
75
+ model_output: torch.FloatTensor,
76
+ timestep: int,
77
+ prev_timestep: int,
78
+ sample: torch.FloatTensor,
79
+ ) -> torch.FloatTensor:
80
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
81
+ alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
82
+ sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
83
+ h = lambda_t - lambda_s
84
+ if self.algorithm_type == "dpmsolver++":
85
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
86
+ elif self.algorithm_type == "dpmsolver":
87
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
88
+ return x_t
89
+
90
+ def singlestep_dpm_solver_second_order_update(
91
+ self,
92
+ model_output_list: List[torch.FloatTensor],
93
+ timestep_list: List[int],
94
+ prev_timestep: int,
95
+ sample: torch.FloatTensor,
96
+ ) -> torch.FloatTensor:
97
+ t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
98
+ m0, m1 = model_output_list[-1], model_output_list[-2]
99
+ lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
100
+ alpha_t, alpha_s1 = self.alpha_t[t], self.alpha_t[s1]
101
+ sigma_t, sigma_s1 = self.sigma_t[t], self.sigma_t[s1]
102
+ h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1
103
+ r0 = h_0 / h
104
+ D0, D1 = m1, (1.0 / r0) * (m0 - m1)
105
+ if self.algorithm_type == "dpmsolver++":
106
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
107
+ if self.solver_type == "midpoint":
108
+ x_t = (
109
+ (sigma_t / sigma_s1) * sample
110
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
111
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
112
+ )
113
+ elif self.solver_type == "heun":
114
+ x_t = (
115
+ (sigma_t / sigma_s1) * sample
116
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
117
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
118
+ )
119
+ elif self.algorithm_type == "dpmsolver":
120
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
121
+ if self.solver_type == "midpoint":
122
+ x_t = (
123
+ (alpha_t / alpha_s1) * sample
124
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
125
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
126
+ )
127
+ elif self.solver_type == "heun":
128
+ x_t = (
129
+ (alpha_t / alpha_s1) * sample
130
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
131
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
132
+ )
133
+ return x_t
134
+
135
+ def singlestep_dpm_solver_third_order_update(
136
+ self,
137
+ model_output_list: List[torch.FloatTensor],
138
+ timestep_list: List[int],
139
+ prev_timestep: int,
140
+ sample: torch.FloatTensor,
141
+ ) -> torch.FloatTensor:
142
+ t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
143
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
144
+ lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
145
+ self.lambda_t[t],
146
+ self.lambda_t[s0],
147
+ self.lambda_t[s1],
148
+ self.lambda_t[s2],
149
+ )
150
+ alpha_t, alpha_s2 = self.alpha_t[t], self.alpha_t[s2]
151
+ sigma_t, sigma_s2 = self.sigma_t[t], self.sigma_t[s2]
152
+ h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2
153
+ r0, r1 = h_0 / h, h_1 / h
154
+ D0 = m2
155
+ D1_0, D1_1 = (1.0 / r1) * (m1 - m2), (1.0 / r0) * (m0 - m2)
156
+ D1 = (r0 * D1_0 - r1 * D1_1) / (r0 - r1)
157
+ D2 = 2.0 * (D1_1 - D1_0) / (r0 - r1)
158
+ if self.algorithm_type == "dpmsolver++":
159
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
160
+ if self.solver_type == "midpoint":
161
+ x_t = (
162
+ (sigma_t / sigma_s2) * sample
163
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
164
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1_1
165
+ )
166
+ elif self.solver_type == "heun":
167
+ x_t = (
168
+ (sigma_t / sigma_s2) * sample
169
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
170
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
171
+ - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
172
+ )
173
+ elif self.algorithm_type == "dpmsolver":
174
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
175
+ if self.solver_type == "midpoint":
176
+ x_t = (
177
+ (alpha_t / alpha_s2) * sample
178
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
179
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1_1
180
+ )
181
+ elif self.solver_type == "heun":
182
+ x_t = (
183
+ (alpha_t / alpha_s2) * sample
184
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
185
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
186
+ - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
187
+ )
188
+ return x_t
189
+
190
+ def step(
191
+ self,
192
+ model_output: torch.FloatTensor,
193
+ model_output_type: str,
194
+ timestep: int,
195
+ sample: torch.FloatTensor,
196
+ ) -> SchedulerStepOutput:
197
+
198
+ step_index = (self.timesteps == timestep).nonzero().item()
199
+
200
+ # Check if this step is the follow-up of the previous step.
201
+ # If not, then we reset and treat it as a new run.
202
+ if self.last_step_index is not None and self.last_step_index != step_index - 1:
203
+ self.reset()
204
+ self.last_step_index = step_index
205
+
206
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
207
+ model_output_convert = self.convert_output(model_output, model_output_type, sample, timestep)
208
+
209
+ if self.algorithm_type == "dpmsolver++":
210
+ model_output = model_output_convert.pred_original_sample
211
+ else:
212
+ model_output = model_output_convert.pred_epsilon
213
+
214
+ for i in range(self.solver_order - 1):
215
+ self.model_outputs[i] = self.model_outputs[i + 1]
216
+ self.model_outputs[-1] = model_output
217
+
218
+ order = self.order_list[step_index]
219
+
220
+ # For img2img denoising might start with order>1 which is not possible
221
+ # In this case make sure that the first two steps are both order=1
222
+ while self.model_outputs[-order] is None:
223
+ order -= 1
224
+
225
+ # For single-step solvers, we use the initial value at each time with order = 1.
226
+ if order == 1:
227
+ self.sample = sample
228
+
229
+ timestep_list = [self.timesteps[step_index - i] for i in range(order - 1, 0, -1)] + [timestep]
230
+
231
+ if order == 1:
232
+ prev_sample = self.dpm_solver_first_order_update(self.model_outputs[-1], timestep_list[-1], prev_timestep, self.sample)
233
+ elif order == 2:
234
+ prev_sample = self.singlestep_dpm_solver_second_order_update(self.model_outputs, timestep_list, prev_timestep, self.sample)
235
+ elif order == 3:
236
+ prev_sample = self.singlestep_dpm_solver_third_order_update(self.model_outputs, timestep_list, prev_timestep, self.sample)
237
+ else:
238
+ raise NotImplementedError
239
+
240
+ return SchedulerStepOutput(
241
+ prev_sample=prev_sample,
242
+ pred_original_sample=model_output_convert.pred_original_sample
243
+ )
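For reference, the per-step order pattern used by the single-step solver, replicated here from the expression in get_order_list() for solver_order=2 and 6 inference steps (the list is indexed by step_index during sampling):

steps, order = 6, 2
orders = [1] + [(i % order) + 1 for i in range(steps - 1)] + [1]
orders[-1] = 1             # lower_order_final=True
print(orders)              # [1, 1, 2, 1, 2, 1, 1]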
schedulers/utils.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import List
17
+
18
+ import torch
19
+
20
+
21
+ def get_betas(name: str, num_steps: int = 1000, shift_snr: float = 1, terminal_pure_noise: bool = False):
22
+ # Get betas
23
+ max_beta = 1 if terminal_pure_noise else 0.999
24
+ if name == "squared_linear":
25
+ betas = torch.linspace(0.00085**0.5, 0.012**0.5, num_steps) ** 2
26
+ elif name == "cosine":
27
+ betas = get_cosine_betas(num_steps, max_beta=max_beta)
28
+ elif name == "alphas_cumprod_linear":
29
+ betas = get_alphas_cumprod_linear_betas(num_steps, max_beta=max_beta)
30
+ elif name == "sigmoid":
31
+ betas = get_sigmoid_betas(num_steps, max_beta=max_beta, square=True, slop=0.7)
32
+ else:
33
+ raise NotImplementedError
34
+
35
+ # Shift snr
36
+ betas = shift_betas_by_snr_factor(betas, shift_snr)
37
+
38
+ # Ensure terminal pure noise
39
+ # Only the non-cosine schedules use rescaling; the cosine schedule can directly set max_beta=1 to ensure terminal pure noise.
40
+ if name == "squared_linear" and terminal_pure_noise:
41
+ betas = rescale_betas_to_ensure_terminal_pure_noise(betas)
42
+
43
+ return betas
44
+
45
+
46
+ def validate_betas(betas: List[float]) -> bool:
47
+ """
48
+ Validate that betas are strictly increasing and within the (0, 1] range, i.e. 0 < beta_{t-1} < beta_{t} <= 1
49
+
50
+ Args:
51
+ betas (List[float]): betas
52
+
53
+ Returns:
54
+ bool: True if betas is correct
55
+ """
56
+ return all(b1 < b2 for b1, b2 in zip(betas, betas[1:])) and betas[0] > 0 and betas[-1] <= 1
57
+
58
+
59
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar_fn, max_beta=0.999):
60
+ betas = []
61
+ for i in range(num_diffusion_timesteps):
62
+ t1 = i / num_diffusion_timesteps
63
+ t2 = (i + 1) / num_diffusion_timesteps
64
+ betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
65
+ if not validate_betas(betas):
66
+ import logging
67
+ logging.warning("No feasible betas for given alpha bar")
68
+ return torch.tensor(betas, dtype=torch.float32)
69
+
70
+
71
+ def get_cosine_betas(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor:
72
+ def alpha_bar_fn(time_step):
73
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
74
+ return betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar_fn, max_beta)
75
+
76
+
77
+ def get_sigmoid_betas(num_diffusion_timesteps, max_beta, square=False, slop=1):
78
+ def alpha_bar_fn(t):
79
+ def sigmoid(x):
80
+ return 1 / (1 + math.exp(-x * slop))
81
+ s = 6 # (-6, 6) from geodiff
82
+ vb = sigmoid(-s)
83
+ ve = sigmoid(s)
84
+ return ((sigmoid(s - t * 2 * s) - vb) / (ve - vb))**(1 if not square else 2)
85
+ return betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar_fn, max_beta)
86
+
87
+
88
+ def get_alphas_cumprod_linear_betas(num_diffusion_timesteps, max_beta):
89
+ def alpha_bar_fn(t):
90
+ return 1 - t
91
+ return betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar_fn, max_beta=max_beta)
92
+
93
+
94
+ def shift_betas_by_snr_factor(betas: torch.Tensor, factor: float) -> torch.Tensor:
95
+ if factor == 1.0:
96
+ return betas
97
+ # Convert betas to snr
98
+ alphas = 1 - betas
99
+ alphas_cumprod = alphas.cumprod(dim=0)
100
+ snr = alphas_cumprod / (1 - alphas_cumprod)
101
+ # Shift snr
102
+ snr *= factor
103
+ # Convert snr to betas
104
+ alphas_cumprod = snr / (1 + snr)
105
+ alphas = torch.cat(
106
+ [alphas_cumprod[0:1], alphas_cumprod[1:] / alphas_cumprod[:-1]])
107
+ betas = 1 - alphas
108
+ return betas
109
+
110
+
111
+ def rescale_betas_to_ensure_terminal_pure_noise(betas: torch.Tensor) -> torch.Tensor:
112
+ # Convert betas to alphas_cumprod_sqrt
113
+ alphas = 1 - betas
114
+ alphas_cumprod = alphas.cumprod(0)
115
+ alphas_cumprod_sqrt = alphas_cumprod.sqrt()
116
+ # Rescale alphas_cumprod_sqrt such that alphas_cumprod_sqrt[0] remains unchanged but alphas_cumprod_sqrt[-1] = 0
117
+ alphas_cumprod_sqrt = (alphas_cumprod_sqrt - alphas_cumprod_sqrt[-1]) / (
118
+ alphas_cumprod_sqrt[0] - alphas_cumprod_sqrt[-1]) * alphas_cumprod_sqrt[0]
119
+ # Convert alphas_cumprod_sqrt to betas
120
+ alphas_cumprod = alphas_cumprod_sqrt ** 2
121
+ alphas = torch.cat(
122
+ [alphas_cumprod[0:1], alphas_cumprod[1:] / alphas_cumprod[:-1]])
123
+ betas = 1 - alphas
124
+ return betas
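A quick check of the terminal-pure-noise rescaling above: with terminal_pure_noise=True the cumulative alpha of the last training step is driven to zero, so x_T carries no signal. A sketch using get_betas as defined here:

import torch
from schedulers.utils import get_betas

betas = get_betas("squared_linear", num_steps=1000, terminal_pure_noise=True)
alphas_cumprod = (1 - betas).cumprod(dim=0)
print(alphas_cumprod[-1].item())   # ~0.0 -> pure noise at t = T
print(betas[-1].item())            # the final beta is rescaled to 1.0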
utils.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch.nn as nn
16
+ import importlib
17
+
18
+ def zero_module(module):
19
+ if isinstance(module, nn.Linear):
20
+ module.weight.data.zero_()
21
+ if module.bias is not None:
22
+ module.bias.data.zero_()
23
+ return module
24
+
25
+ def get_obj_from_str(string, reload=False):
26
+ module, cls = string.rsplit(".", 1)
27
+ if reload:
28
+ module_imp = importlib.import_module(module)
29
+ importlib.reload(module_imp)
30
+ return getattr(importlib.import_module(module, package=None), cls)
31
+
32
+ def instantiate_from_config(config):
33
+ if "target" not in config:
34
+ if config == '__is_first_stage__':
35
+ return None
36
+ elif config == "__is_unconditional__":
37
+ return None
38
+ raise KeyError("Expected key `target` to instantiate.")
39
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
40
+
41
+ def update_dict(old_dict, new_dict):
42
+ old_keys = old_dict.keys()
43
+ for new_key in new_dict.keys():
44
+ if new_key in old_keys:
45
+ if type(old_dict[new_key]) == list:
46
+ if type(new_dict[new_key]) == list:
47
+ old_dict[new_key].extend(new_dict[new_key])
48
+ else:
49
+ old_dict[new_key].append(new_dict[new_key])
50
+ else:
51
+ old_dict[new_key] = [old_dict[new_key]]
52
+ old_dict[new_key].append(new_dict[new_key])
53
+ else:
54
+ old_dict[new_key] = new_dict[new_key]
55
+ return old_dict
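A short example of the config-driven instantiation helper above; the target class and params below are illustrative only:

from utils import instantiate_from_config

config = {
    "target": "torch.nn.Linear",                 # any importable "module.Class" path
    "params": {"in_features": 8, "out_features": 4},
}
layer = instantiate_from_config(config)          # equivalent to torch.nn.Linear(8, 4)
print(layer)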