Upload 70 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitattributes +6 -0
- DepthEstimator.py +67 -0
- GenerateAudio.py +184 -0
- GenerateCaptions.py +494 -0
- README.md +50 -13
- SoundMapper.py +438 -0
- app.py +182 -0
- audio_mixer.py +428 -0
- config.py +16 -0
- environment.yml +8 -0
- external_models/TangoFlux/.gitignore +175 -0
- external_models/TangoFlux/Demo.ipynb +117 -0
- external_models/TangoFlux/Inference.ipynb +0 -0
- external_models/TangoFlux/LICENSE.md +51 -0
- external_models/TangoFlux/Notice +1 -0
- external_models/TangoFlux/README.md +188 -0
- external_models/TangoFlux/STABILITY_AI_COMMUNITY_LICENSE.md +57 -0
- external_models/TangoFlux/__init__.py +4 -0
- external_models/TangoFlux/assets/tangoflux.png +3 -0
- external_models/TangoFlux/assets/tf_opener.png +3 -0
- external_models/TangoFlux/assets/tf_teaser.png +3 -0
- external_models/TangoFlux/comfyui/README.md +78 -0
- external_models/TangoFlux/comfyui/__init__.py +6 -0
- external_models/TangoFlux/comfyui/example_workflow.json +168 -0
- external_models/TangoFlux/comfyui/install.py +79 -0
- external_models/TangoFlux/comfyui/nodes.py +328 -0
- external_models/TangoFlux/comfyui/requirements.txt +9 -0
- external_models/TangoFlux/comfyui/server.py +64 -0
- external_models/TangoFlux/comfyui/teacache.py +283 -0
- external_models/TangoFlux/comfyui/web/js/playAudio.js +59 -0
- external_models/TangoFlux/configs/__init__.py +0 -0
- external_models/TangoFlux/configs/accelerator_config.yaml +17 -0
- external_models/TangoFlux/configs/tangoflux_config.yaml +36 -0
- external_models/TangoFlux/crpo.sh +2 -0
- external_models/TangoFlux/inference.py +7 -0
- external_models/TangoFlux/replicate_demo/cog.yaml +31 -0
- external_models/TangoFlux/replicate_demo/predict.py +92 -0
- external_models/TangoFlux/requirements.txt +12 -0
- external_models/TangoFlux/setup.py +30 -0
- external_models/TangoFlux/tangoflux/__init__.py +60 -0
- external_models/TangoFlux/tangoflux/cli.py +29 -0
- external_models/TangoFlux/tangoflux/demo.py +63 -0
- external_models/TangoFlux/tangoflux/generate_crpo_dataset.py +204 -0
- external_models/TangoFlux/tangoflux/label_crpo.py +153 -0
- external_models/TangoFlux/tangoflux/model.py +556 -0
- external_models/TangoFlux/tangoflux/train.py +588 -0
- external_models/TangoFlux/tangoflux/train_dpo.py +608 -0
- external_models/TangoFlux/tangoflux/utils.py +159 -0
- external_models/TangoFlux/train.sh +2 -0
- external_models/depth-fm/.gitignore +5 -0
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+external_models/depth-fm/assets/dog.png filter=lfs diff=lfs merge=lfs -text
+external_models/depth-fm/assets/figures/dfm-cover.png filter=lfs diff=lfs merge=lfs -text
+external_models/depth-fm/assets/figures/radio.png filter=lfs diff=lfs merge=lfs -text
+external_models/TangoFlux/assets/tangoflux.png filter=lfs diff=lfs merge=lfs -text
+external_models/TangoFlux/assets/tf_opener.png filter=lfs diff=lfs merge=lfs -text
+external_models/TangoFlux/assets/tf_teaser.png filter=lfs diff=lfs merge=lfs -text
DepthEstimator.py
ADDED
@@ -0,0 +1,67 @@
import torch
from accelerate.test_utils.testing import get_backend
from PIL import Image
import os
import sys
from config import LOGS_DIR, DEPTH_FM_CHECKPOINT, DEPTH_FM_DIR
sys.path.append(DEPTH_FM_DIR + '/depthfm')
from dfm import DepthFM
from unet import UNetModel
import einops
import numpy as np
from torchvision import transforms


class DepthEstimator:
    def __init__(self, image_dir=LOGS_DIR):
        self.device, _, _ = get_backend()
        self.image_dir = image_dir
        self.model = None

    def _load_model(self):
        if self.model is None:
            self.model = DepthFM(DEPTH_FM_CHECKPOINT).to(self.device).eval()
        else:
            self.model = self.model.to(self.device).eval()

    def _unload_model(self):
        if self.model is not None:
            self.model = self.model.to("cpu")
            torch.cuda.empty_cache()

    def estimate_depth(self, image_path: str) -> list:
        print("Estimating depth...")
        predictions_list = []
        self._load_model()
        for img in os.listdir(image_path):
            if img.endswith(".jpg") or img.endswith(".jpeg") or img.endswith(".png"):
                image = Image.open(os.path.join(image_path, img))
                x = np.array(image)
                x = einops.rearrange(x, 'h w c -> c h w')
                x = x / 127.5 - 1  # scale pixel values to [-1, 1]
                x = torch.tensor(x, dtype=torch.float32)[None]
                with torch.no_grad():
                    depth = self.model.predict_depth(x.to(self.device), num_steps=2, ensemble_size=4)  # returns a tensor
                depth = depth.cpu()
                to_pil = transforms.ToPILImage()
                PIL_image = to_pil(depth.squeeze())
                predictions_list.append({"depth": PIL_image})
                del x, depth
                torch.cuda.empty_cache()
        self._unload_model()
        print("Depth estimation complete.")
        return predictions_list

    def visualize(self, predictions_list: list) -> None:
        for (i, prediction) in enumerate(predictions_list):
            prediction["depth"].save(f"depth_{i}.png")


# Estimator = DepthEstimator()
# predictions = Estimator.estimate_depth(Estimator.image_dir)
# Estimator.visualize(predictions)
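For reference, the commented-out lines at the bottom of the file correspond to the following standalone usage; this is a minimal sketch assuming `LOGS_DIR` already contains perspective images and `DEPTH_FM_CHECKPOINT` points at the downloaded `depthfm-v1.ckpt`:

```python
# Minimal sketch; both paths come from config.py and must exist locally.
from DepthEstimator import DepthEstimator

estimator = DepthEstimator()                                  # defaults to LOGS_DIR
predictions = estimator.estimate_depth(estimator.image_dir)   # list of {"depth": PIL.Image}
estimator.visualize(predictions)                              # writes depth_0.png, depth_1.png, ...
```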
GenerateAudio.py
ADDED
@@ -0,0 +1,184 @@
import torchaudio
import sys
import torch
import random
from config import TANGO_FLUX_DIR
sys.path.append(TANGO_FLUX_DIR)
from tangoflux import TangoFluxInference
from transformers import AutoTokenizer, T5EncoderModel
from collections import Counter

class GenerateAudio():
    def __init__(self):
        self.device = "cuda"
        self.model = None
        self.text_encoder = None

        # Basic categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
        }

        # Suffixes and prefixes for pattern matching
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban'
        }

    def _load_model(self):
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(self.device).eval()
        else:
            self.text_encoder = self.text_encoder.to(self.device)

    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress
            )
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
            waveform_end = int(duration * self.model.vae.config.sampling_rate)
            wave = wave[:, :waveform_end]

        return wave

    def _categorize_object(self, object_name):
        """Categorize an object based on keywords or patterns"""
        object_lower = object_name.lower()

        # Check if the object contains any category keywords
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category

        # Check suffix/prefix patterns
        words = object_lower.split()
        for word in words:
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category

        return "unknown"

    def _describe_object_sound(self, object_name, zone):
        """Generate an appropriate sound description based on object type and distance"""
        category = self._categorize_object(object_name)

        # Volume descriptor based on zone
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"]
        }

        volume = random.choice(volume_descriptors[zone])

        # Sound descriptors based on category
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion"
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind"
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds"
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence"
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity"
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence"
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence"
            ]
        }

        # Select a template for this category
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)

        # Fill in the template
        description = template.format(volume=volume, object=object_name)
        return description

    def create_audio_prompt(self, object_depths):
        if not object_depths:
            return "Environmental ambient sounds."

        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."

    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        self._load_model()

        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth to prioritize closer objects
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)

        print(f"Generated audio prompt: {prompt}")

        wave = self.generate_sound(
            prompt,
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale
        )

        sample_rate = self.model.vae.config.sampling_rate

        if output_path:
            torchaudio.save(
                output_path,
                wave.unsqueeze(0),
                sample_rate
            )
            print(f"Audio saved to: {output_path}")

        return wave, sample_rate
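A minimal usage sketch of the class above, assuming a CUDA device and `object_depths` dictionaries shaped like those returned by `SoundMapper.analyze_object_depths` (only the `original_label`, `mean_depth`, and `sound_description` keys are read here); the dictionary contents below are illustrative, not taken from the repository:

```python
# Minimal sketch; object_depths would normally come from SoundMapper.analyze_object_depths().
from GenerateAudio import GenerateAudio

object_depths = [{
    "original_label": "car",
    "mean_depth": 0.21,                                   # closer objects have smaller depth
    "sound_description": "engine humming with occasional honking",
}]

generator = GenerateAudio()
wave, sample_rate = generator.process_and_generate_audio(
    object_depths,
    output_path="output/front.wav",   # any writable .wav path
    duration=10,
)
```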
GenerateCaptions.py
ADDED
@@ -0,0 +1,494 @@
#!/usr/bin/env python3
"""
streetsoundtext.py - A pipeline that downloads Google Street View panoramas,
extracts perspective views, and analyzes them for sound information.
"""

import os
import requests
import argparse
import numpy as np
import torch
import time
from PIL import Image
from io import BytesIO
from config import LOGS_DIR
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
from utils import sample_perspective_img
import cv2

log_dir = LOGS_DIR
os.makedirs(log_dir, exist_ok=True)  # Creates the directory if it doesn't exist

# soundscape_query = "<image>\nWhat can we expect to hear from the location captured in this image? Name the around five nouns. Avoid speculation and provide a concise response including sound sources visible in the image."
soundscape_query = """<image>
Identify 5 potential sound sources visible in this image. For each source, provide both the noun and a brief description of its typical sound.

Format your response exactly like these examples (do not include the word "Noun:" in your response):
Car: engine humming with occasional honking.
River: gentle flowing water with subtle splashing sounds.
Trees: rustling leaves moved by the wind.
"""
# Constants
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Model Leaderboard Paths
MODEL_LEADERBOARD = {
    "intern_2_5-8B": "OpenGVLab/InternVL2_5-8B-MPO",
    "intern_2_5-4B": "OpenGVLab/InternVL2_5-4B-MPO",
}

class StreetViewDownloader:
    """Downloads panoramic images from Google Street View"""

    def __init__(self):
        # URLs for API requests
        # https://www.google.ca/maps/rpc/photo/listentityphotos?authuser=0&hl=en&gl=us&pb=!1e3!5m45!2m2!1i203!2i100!3m3!2i4!3sCAEIBAgFCAYgAQ!5b1!7m33!1m3!1e1!2b0!3e3!1m3!1e2!2b1!3e2!1m3!1e2!2b0!3e3!1m3!1e8!2b0!3e3!1m3!1e10!2b0!3e3!1m3!1e10!2b1!3e2!1m3!1e10!2b0!3e4!1m3!1e9!2b1!3e2!2b1!8m0!9b0!11m1!4b1!6m3!1sI63QZ8b4BcSli-gPvPHf-Qc!7e81!15i11021!9m2!2d-90.30324219145255!3d38.636242944711036!10d91.37627840655999
        #self.panoid_req = 'https://www.google.com/maps/preview/reveal?authuser=0&hl=en&gl=us&pb=!2m9!1m3!1d82597.14038230096!2d{}!3d{}!2m0!3m2!1i1523!2i1272!4f13.1!3m2!2d{}!3d{}!4m2!1syPETZOjwLvCIptQPiJum-AQ!7e81!5m5!2m4!1i96!2i64!3i1!4i8'
        self.panoid_req = 'https://www.google.ca/maps/rpc/photo/listentityphotos?authuser=0&hl=en&gl=us&pb=!1e3!5m45!2m2!1i203!2i100!3m3!2i4!3sCAEIBAgFCAYgAQ!5b1!7m33!1m3!1e1!2b0!3e3!1m3!1e2!2b1!3e2!1m3!1e2!2b0!3e3!1m3!1e8!2b0!3e3!1m3!1e10!2b0!3e3!1m3!1e10!2b1!3e2!1m3!1e10!2b0!3e4!1m3!1e9!2b1!3e2!2b1!8m0!9b0!11m1!4b1!6m3!1sI63QZ8b4BcSli-gPvPHf-Qc!7e81!15i11021!9m2!2d{}!3d{}!10d25'
        # https://www.google.com/maps/photometa/v1?authuser=0&hl=en&gl=us&pb=!1m4!1smaps_sv.tactile!11m2!2m1!1b1!2m2!1sen!2sus!3m3!1m2!1e2!2s{}!4m61!1e1!1e2!1e3!1e4!1e5!1e6!1e8!1e12!1e17!2m1!1e1!4m1!1i48!5m1!1e1!5m1!1e2!6m1!1e1!6m1!1e2!9m36!1m3!1e2!2b1!3e2!1m3!1e2!2b0!3e3!1m3!1e3!2b1!3e2!1m3!1e3!2b0!3e3!1m3!1e8!2b0!3e3!1m3!1e1!2b0!3e3!1m3!1e4!2b0!3e3!1m3!1e10!2b1!3e2!1m3!1e10!2b0!3e3!11m2!3m1!4b1 # vmSzE7zkK2eETwAP_r8UdQ
        # https://www.google.ca/maps/photometa/v1?authuser=0&hl=en&gl=us&pb=!1m4!1smaps_sv.tactile!11m2!2m1!1b1!2m2!1sen!2sus!3m3!1m2!1e2!2s{}!4m61!1e1!1e2!1e3!1e4!1e5!1e6!1e8!1e12!1e17!2m1!1e1!4m1!1i48!5m1!1e1!5m1!1e2!6m1!1e1!6m1!1e2!9m36!1m3!1e2!2b1!3e2!1m3!1e2!2b0!3e3!1m3!1e3!2b1!3e2!1m3!1e3!2b0!3e3!1m3!1e8!2b0!3e3!1m3!1e1!2b0!3e3!1m3!1e4!2b0!3e3!1m3!1e10!2b1!3e2!1m3!1e10!2b0!3e3!11m2!3m1!4b1 # -9HfuNFUDOw_IP5SA5IspA
        self.photometa_req = 'https://www.google.com/maps/photometa/v1?authuser=0&hl=en&gl=us&pb=!1m4!1smaps_sv.tactile!11m2!2m1!1b1!2m2!1sen!2sus!3m5!1m2!1e2!2s{}!2m1!5s0x87d8b49f53fc92e9:0x6ecb6e520c6f4d9f!4m57!1e1!1e2!1e3!1e4!1e5!1e6!1e8!1e12!2m1!1e1!4m1!1i48!5m1!1e1!5m1!1e2!6m1!1e1!6m1!1e2!9m36!1m3!1e2!2b1!3e2!1m3!1e2!2b0!3e3!1m3!1e3!2b1!3e2!1m3!1e3!2b0!3e3!1m3!1e8!2b0!3e3!1m3!1e1!2b0!3e3!1m3!1e4!2b0!3e3!1m3!1e10!2b1!3e2!1m3!1e10!2b0!3e3'
        self.panimg_req = 'https://streetviewpixels-pa.googleapis.com/v1/tile?cb_client=maps_sv.tactile&panoid={}&x={}&y={}&zoom={}'

    def get_image_id(self, lat, lon):
        """Get Street View panorama ID for given coordinates"""
        null = None
        pr_response = requests.get(self.panoid_req.format(lon, lat, lon, lat))
        if pr_response.status_code != 200:
            error_message = f"Error fetching panorama ID: HTTP {pr_response.status_code}"
            if pr_response.status_code == 400:
                error_message += " - Bad request. Check coordinates format."
            elif pr_response.status_code == 401 or pr_response.status_code == 403:
                error_message += " - Authentication error. Check API key and permissions."
            elif pr_response.status_code == 404:
                error_message += " - No panorama found at these coordinates."
            elif pr_response.status_code == 429:
                error_message += " - Rate limit exceeded. Try again later."
            elif pr_response.status_code >= 500:
                error_message += " - Server error. Try again later."
            print(error_message)  # surface the reason before giving up
            return None

        pr = BytesIO(pr_response.content).getvalue().decode('utf-8')
        pr = eval(pr[pr.index('\n'):])
        try:
            panoid = pr[0][0][0]
        except:
            return None

        return panoid

    def download_image(self, lat, lon, zoom=1):
        """Download Street View panorama and metadata"""
        null = None
        panoid = self.get_image_id(lat, lon)
        if panoid is None:
            raise ValueError(f"get_image_id() failed at coordinates: {lat}, {lon}")

        # Get metadata
        pm_response = requests.get(self.photometa_req.format(panoid))
        pm = BytesIO(pm_response.content).getvalue().decode('utf-8')
        pm = eval(pm[pm.index('\n'):])
        pan_list = pm[1][0][5][0][3][0]

        # Extract relevant info
        pid = pan_list[0][0][1]
        plat = pan_list[0][2][0][2]
        plon = pan_list[0][2][0][3]
        p_orient = pan_list[0][2][2][0]

        # Download image tiles and assemble panorama
        img_part_inds = [(x, y) for x in range(2**zoom) for y in range(2**(zoom-1))]
        img = np.zeros((512*(2**(zoom-1)), 512*(2**zoom), 3), dtype=np.uint8)

        for x, y in img_part_inds:
            sub_img_response = requests.get(self.panimg_req.format(pid, x, y, zoom))
            sub_img = np.array(Image.open(BytesIO(sub_img_response.content)))
            img[512*y:512*(y+1), 512*x:512*(x+1)] = sub_img

        if (img[-1] == 0).all():
            # raise ValueError("Failed to download complete panorama")
            print("Failed to download complete panorama")

        return img, pid, plat, plon, p_orient


class PerspectiveExtractor:
    """Extracts perspective views from panoramic images"""

    def __init__(self, output_shape=(256, 256), fov=(90, 90)):
        self.output_shape = output_shape
        self.fov = fov

    def extract_views(self, pano_img, face_size=512):
        """Extract front, back, left, and right views based on orientation"""
        # orientations = {
        #     "front": (0, p_orient, 0),  # Align front with real orientation
        #     "back": (0, p_orient + 180, 0),  # Behind
        #     "left": (0, p_orient - 90, 0),  # Left side
        #     "right": (0, p_orient + 90, 0),  # Right side
        # }

        # cutouts = {}
        # for view, rot in orientations.items():
        #     cutout, fov, applied_rot = sample_perspective_img(
        #         pano_img, self.output_shape, fov=self.fov, rot=rot
        #     )
        #     cutouts[view] = cutout

        # return cutouts
        """
        Convert ERP panorama to four cubic faces: Front, Left, Back, Right.
        Args:
            erp_img (numpy.ndarray): The input equirectangular image.
            face_size (int): The size of each cubic face.
        Returns:
            dict: A dictionary with the four cube faces.
        """
        # Get ERP dimensions
        h_erp, w_erp, _ = pano_img.shape
        # Define cube face directions (yaw, pitch, roll)
        cube_faces = {
            "front": (0, 0),
            "left": (90, 0),
            "back": (180, 0),
            "right": (-90, 0),
        }
        # Output faces
        faces = {}
        # Generate each face
        for face_name, (yaw, pitch) in cube_faces.items():
            # Create a perspective transformation matrix
            fov = 90  # Field of view
            K = np.array([
                [face_size / (2 * np.tan(np.radians(fov / 2))), 0, face_size / 2],
                [0, face_size / (2 * np.tan(np.radians(fov / 2))), face_size / 2],
                [0, 0, 1]
            ])
            # Generate 3D world coordinates for the cube face
            x, y = np.meshgrid(np.linspace(-1, 1, face_size), np.linspace(-1, 1, face_size))
            z = np.ones_like(x)
            # Normalize 3D points
            points_3d = np.stack((x, y, z), axis=-1)  # Shape: (H, W, 3)
            points_3d /= np.linalg.norm(points_3d, axis=-1, keepdims=True)
            # Apply rotation to align with the cube face
            yaw_rad, pitch_rad = np.radians(yaw), np.radians(pitch)
            Ry = np.array([[np.cos(yaw_rad), 0, np.sin(yaw_rad)], [0, 1, 0], [-np.sin(yaw_rad), 0, np.cos(yaw_rad)]])
            Rx = np.array([[1, 0, 0], [0, np.cos(pitch_rad), -np.sin(pitch_rad)], [0, np.sin(pitch_rad), np.cos(pitch_rad)]])
            R = Ry @ Rx
            # Rotate points
            points_3d_rot = np.einsum('ij,hwj->hwi', R, points_3d)
            # Convert 3D to spherical coordinates
            lon = np.arctan2(points_3d_rot[..., 0], points_3d_rot[..., 2])
            lat = np.arcsin(points_3d_rot[..., 1])
            # Map spherical coordinates to ERP image coordinates
            x_erp = (w_erp * (lon / (2 * np.pi) + 0.5)).astype(np.float32)
            y_erp = (h_erp * (0.5 - lat / np.pi)).astype(np.float32)
            # Sample pixels from ERP image
            face_img = cv2.remap(pano_img, x_erp, y_erp, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_WRAP)
            cv2.rotate(face_img, cv2.ROTATE_180, face_img)
            faces[face_name] = face_img
        return faces


class ImageAnalyzer:
    """Analyzes images using Vision-Language Models"""

    def __init__(self, model_name="intern_2_5-4B", use_cuda=True):
        self.model_name = model_name
        self.use_cuda = use_cuda and torch.cuda.is_available()
        self.model, self.tokenizer, self.device = self._load_model()

    def _load_model(self):
        """Load selected Vision-Language Model"""
        if self.model_name not in MODEL_LEADERBOARD:
            raise ValueError(f"Model '{self.model_name}' not found. Choose from: {list(MODEL_LEADERBOARD.keys())}")

        model_path = MODEL_LEADERBOARD[self.model_name]

        # Configure device and parameters
        if self.use_cuda:
            device = torch.device("cuda")
            torch_dtype = torch.bfloat16
            use_flash_attn = True
        else:
            device = torch.device("cpu")
            torch_dtype = torch.float32
            use_flash_attn = False

        # Load model and tokenizer
        model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch_dtype,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            use_flash_attn=use_flash_attn,
            trust_remote_code=True,
        ).eval().to(device)

        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            use_fast=False
        )

        return model, tokenizer, device

    def _build_transform(self, input_size=448):
        """Create image transformation pipeline"""
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
        ])
        return transform

    def _find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        """Find closest aspect ratio for image tiling"""
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def _preprocess_image(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
        """Preprocess image for model input"""
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Calculate possible image aspect ratios
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if i * j <= max_num and i * j >= min_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # Find closest aspect ratio
        target_aspect_ratio = self._find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        # Calculate target dimensions
        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        # Resize and split image
        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)

        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)

        return processed_images

    def load_image(self, image_path, input_size=448, max_num=12):
        """Load and process image for analysis"""
        image = Image.open(image_path).convert('RGB')
        transform = self._build_transform(input_size)
        images = self._preprocess_image(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def analyze_image(self, image_path, max_num=12):
        """Analyze image for expected sounds"""
        # Load and process image
        pixel_values = self.load_image(image_path, max_num=max_num)

        # Move to device with appropriate dtype
        if self.device.type == "cuda":
            pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
        else:
            pixel_values = pixel_values.to(torch.float32).to(self.device)

        # Create sound-focused query
        query = soundscape_query

        # Generate response
        generation_config = dict(max_new_tokens=1024, do_sample=True)
        response = self.model.chat(self.tokenizer, pixel_values, query, generation_config)

        return response


class StreetSoundTextPipeline:
    """Complete pipeline for Street View sound analysis"""

    def __init__(self, log_dir="logs", model_name="intern_2_5-4B", use_cuda=True):
        # Create log directory if it doesn't exist
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # Initialize components
        self.downloader = StreetViewDownloader()
        self.extractor = PerspectiveExtractor()
        # self.analyzer = ImageAnalyzer(model_name=model_name, use_cuda=use_cuda)
        self.analyzer = None
        self.model_name = model_name
        self.use_cuda = use_cuda

    def _load_analyzer(self):
        if self.analyzer is None:
            self.analyzer = ImageAnalyzer(model_name=self.model_name, use_cuda=self.use_cuda)

    def _unload_analyzer(self):
        if self.analyzer is not None:
            if hasattr(self.analyzer, 'model') and self.analyzer.model is not None:
                self.analyzer.model = self.analyzer.model.to("cpu")
                del self.analyzer.model
                self.analyzer.model = None
            torch.cuda.empty_cache()
            self.analyzer = None

    def process(self, lat, lon, view, panoramic=False):
        """
        Process a location to generate sound description for specified view or all views

        Args:
            lat (float): Latitude
            lon (float): Longitude
            view (str): Perspective view ('front', 'back', 'left', 'right')
            panoramic (bool): If True, process all views instead of just the specified one

        Returns:
            dict or list: Results including panorama info and sound description(s)
        """
        if view not in ["front", "back", "left", "right"]:
            raise ValueError(f"Invalid view: {view}. Choose from: front, back, left, right")

        # Step 1: Download panoramic image
        print(f"Downloading Street View panorama for coordinates: {lat}, {lon}")

        pano_path = os.path.join(self.log_dir, "panorama.jpg")
        pano_img, pid, plat, plon, p_orient = self.downloader.download_image(lat, lon)
        Image.fromarray(pano_img).save(pano_path)

        # Step 2: Extract perspective views
        print(f"Extracting perspective views with orientation: {p_orient}°")
        cutouts = self.extractor.extract_views(pano_img, 512)

        # Save all views
        for v, img in cutouts.items():
            view_path = os.path.join(self.log_dir, f"{v}.jpg")
            Image.fromarray(img).save(view_path)

        self._load_analyzer()
        print("\n[DEBUG] Current soundscape query:")
        print(soundscape_query)
        print("-" * 50)
        if panoramic:
            # Process all views
            print("Analyzing all views for sound information")
            results = []

            for current_view in ["front", "back", "left", "right"]:
                view_path = os.path.join(self.log_dir, f"{current_view}.jpg")
                sound_description = self.analyzer.analyze_image(view_path)

                view_result = {
                    "panorama_id": pid,
                    "coordinates": {"lat": plat, "lon": plon},
                    "orientation": p_orient,
                    "view": current_view,
                    "sound_description": sound_description,
                    "files": {
                        "panorama": pano_path,
                        "view_path": view_path
                    }
                }
                results.append(view_result)

            self._unload_analyzer()
            return results
        else:
            # Process only the selected view
            view_path = os.path.join(self.log_dir, f"{view}.jpg")
            print(f"Analyzing {view} view for sound information")
            sound_description = self.analyzer.analyze_image(view_path)

            self._unload_analyzer()

            # Prepare results
            results = {
                "panorama_id": pid,
                "coordinates": {"lat": plat, "lon": plon},
                "orientation": p_orient,
                "view": view,
                "sound_description": sound_description,
                "files": {
                    "panorama": pano_path,
                    "views": {v: os.path.join(self.log_dir, f"{v}.jpg") for v in cutouts.keys()}
                }
            }

            return results


def parse_location(location_str):
    """Parse location string in format 'lat,lon' into float tuple"""
    try:
        lat, lon = map(float, location_str.split(','))
        return lat, lon
    except ValueError:
        raise argparse.ArgumentTypeError("Location must be in format 'latitude,longitude'")


def generate_caption(lat, lon, view="front", model="intern_2_5-4B", cpu_only=False, panoramic=False):
    """
    Generate sound captions for one or all views of a street view location

    Args:
        lat (float/str): Latitude
        lon (float/str): Longitude
        view (str): Perspective view ('front', 'back', 'left', 'right')
        model (str): Model name to use for analysis
        cpu_only (bool): Whether to force CPU usage
        panoramic (bool): If True, process all views instead of just the specified one

    Returns:
        dict or list: Results with sound descriptions
    """
    pipeline = StreetSoundTextPipeline(
        log_dir=log_dir,
        model_name=model,
        use_cuda=not cpu_only
    )

    try:
        results = pipeline.process(lat, lon, view, panoramic=panoramic)

        if panoramic:
            print(f"Generated captions for all views at location: {lat}, {lon}")
        else:
            print(f"Generated caption for {view} view at location: {lat}, {lon}")

        return results
    except Exception as e:
        print(f"Error: {str(e)}")
        return None
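A minimal usage sketch of the `generate_caption` entry point above, using the example coordinates from the README; it assumes network access for the Street View downloads and a GPU for the InternVL checkpoint (pass `cpu_only=True` otherwise):

```python
# Minimal sketch; a real run first downloads a panorama and views into LOGS_DIR.
from GenerateCaptions import generate_caption

result = generate_caption(52.3436723, 4.8529625, view="front", panoramic=False)
if result is not None:
    print(result["view"], "->", result["sound_description"])
```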
README.md
CHANGED
@@ -1,13 +1,50 @@
**A training-free pipeline utilizing pre-trained generative models to synthesize sound for any street on Earth with available Street View panoramic images.**

1. Change to this directory:
```
cd SoundingStreet
```

2. Create the conda environment:
```
conda env create -f environment.yml
conda activate geosynthsound
```

3. Create the necessary directories:
```
mkdir -p logs output
```

4. Download the checkpoint for the depth estimation model:
```
wget https://ommer-lab.com/files/depthfm/depthfm-v1.ckpt -P external_models/depth-fm/checkpoints/
```

5. Run the `SoundingStreet` demo:
```
python main.py --panoramic --location "52.3436723,4.8529625"
```
Intermediate files such as the downloaded panoramic image and the perspective cut-outs are written to `./logs/`; the audio generated for each view and the composite audio for the location are saved under `./output/`, with the composite at `./output/panoramic_composition.wav`.


## Acknowledgements

- **InternVL2.5-8B-MPO**
  For vision-language modeling, we employ InternVL2.5-8B-MPO, which is released under the MIT License.
  GitHub: https://github.com/OpenGVLab/InternVL

- **Grounding DINO**
  We use Grounding DINO for open-set object detection. Grounding DINO is released under the Apache 2.0 License.
  GitHub: https://github.com/IDEA-Research/GroundingDINO

- **DepthFM**
  We utilize the DepthFM model for monocular depth estimation. DepthFM is released under the MIT License.
  GitHub: https://github.com/CompVis/depth-fm

- **TangoFlux**
  We incorporate TangoFlux for text-to-audio generation. TangoFlux is available for non-commercial research use only and is subject to the Stability AI Community License, the WavCaps license, and the original licenses of the datasets used in training.
  GitHub: https://github.com/declare-lab/TangoFlux


Our repository's license and usage terms adhere to the respective licenses of these models.
SoundMapper.py
ADDED
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from DepthEstimator import DepthEstimator
|
2 |
+
import numpy as np
|
3 |
+
from PIL import Image
|
4 |
+
import os
|
5 |
+
from GenerateCaptions import generate_caption
|
6 |
+
import re
|
7 |
+
from config import LOGS_DIR
|
8 |
+
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
9 |
+
import torch
|
10 |
+
from PIL import Image, ImageDraw, ImageFont
|
11 |
+
import spacy
|
12 |
+
import gc
|
13 |
+
|
14 |
+
class SoundMapper:
|
15 |
+
def __init__(self):
|
16 |
+
self.depth_estimator = DepthEstimator()
|
17 |
+
# List of depth maps in dict["predicted_depth" ,"depth"] in (tensor, PIL.Image) format
|
18 |
+
self.device = "cuda"
|
19 |
+
# self.map_list = self.depth_estimator.estimate_depth(self.depth_estimator.image_dir)
|
20 |
+
self.map_list = None
|
21 |
+
self.image_dir = self.depth_estimator.image_dir
|
22 |
+
# self.nlp = spacy.load("en_core_web_sm")
|
23 |
+
self.nlp = None
|
24 |
+
self.dino = None
|
25 |
+
self.dino_processor = None
|
26 |
+
# self.dino = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(self.device)
|
27 |
+
# self.dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
|
28 |
+
|
29 |
+
def _load_nlp(self):
|
30 |
+
if self.nlp is None:
|
31 |
+
self.nlp = spacy.load("en_core_web_sm")
|
32 |
+
return self.nlp
|
33 |
+
|
34 |
+
def _load_depth_maps(self):
|
35 |
+
if self.map_list is None:
|
36 |
+
self.map_list = self.depth_estimator.estimate_depth(self.depth_estimator.image_dir)
|
37 |
+
return self.map_list
|
38 |
+
|
39 |
+
def process_depth_maps(self) -> list:
|
40 |
+
depth_maps = self._load_depth_maps()
|
41 |
+
processed_maps = []
|
42 |
+
for item in depth_maps:
|
43 |
+
depth_map = item["depth"]
|
44 |
+
depth_array = np.array(depth_map)
|
45 |
+
normalization = depth_array / 255.0
|
46 |
+
processed_maps.append({
|
47 |
+
"original": depth_map,
|
48 |
+
"normalization": normalization
|
49 |
+
})
|
50 |
+
return processed_maps
|
51 |
+
|
52 |
+
# def create_depth_zone(self, processed_maps : list, num_zones = 3):
|
53 |
+
# zones_data = []
|
54 |
+
# for depth_data in processed_maps:
|
55 |
+
# normalized = depth_data["normalization"]
|
56 |
+
# thresholds = np.linspace(0, 1, num_zones+1)
|
57 |
+
# zones = []
|
58 |
+
# for i in range(num_zones):
|
59 |
+
# zone_mask = (normalized >= thresholds[i]) & (normalized < thresholds[i+1])
|
60 |
+
# zone_percentage = zone_mask.sum() / zone_mask.size
|
61 |
+
# zones.append({
|
62 |
+
# "range": (thresholds[i], thresholds[i+1]),
|
63 |
+
# "percentage": zone_percentage,
|
64 |
+
# "mask": zone_mask
|
65 |
+
# })
|
66 |
+
# zones_data.append(zones)
|
67 |
+
# return zones_data
|
68 |
+
|
69 |
+
def detect_sound_sources(self, caption_text: str) -> dict:
|
70 |
+
"""
|
71 |
+
Extract nouns and their sound descriptions from caption text.
|
72 |
+
Returns a dictionary mapping nouns to their descriptions.
|
73 |
+
"""
|
74 |
+
sound_sources = {}
|
75 |
+
nlp = self._load_nlp()
|
76 |
+
|
77 |
+
print(f"\n[DEBUG] Beginning sound source detection")
|
78 |
+
print(f"Raw caption text length: {len(caption_text)}")
|
79 |
+
print(f"First 100 chars: {caption_text[:100]}...")
|
80 |
+
|
81 |
+
# Split the caption by newlines to separate entries
|
82 |
+
lines = caption_text.strip().split('\n')
|
83 |
+
print(f"Found {len(lines)} lines after splitting")
|
84 |
+
|
85 |
+
for i, line in enumerate(lines):
|
86 |
+
# Skip empty lines
|
87 |
+
if not line.strip():
|
88 |
+
continue
|
89 |
+
|
90 |
+
print(f"Processing line {i}: {line[:50]}{'...' if len(line) > 50 else ''}")
|
91 |
+
|
92 |
+
# Check if line matches the expected format (Noun: description)
|
93 |
+
if ':' in line:
|
94 |
+
parts = line.split(':', 1) # Split only on the first colon
|
95 |
+
|
96 |
+
# Clean up the noun part - remove numbers and leading/trailing whitespace
|
97 |
+
noun_part = parts[0].strip().lower()
|
98 |
+
# Remove list numbering (e.g., "1. ", "2. ", etc.)
|
99 |
+
noun_part = re.sub(r'^\d+\.\s*', '', noun_part)
|
100 |
+
|
101 |
+
description = parts[1].strip()
|
102 |
+
|
103 |
+
# Clean any markdown formatting
|
104 |
+
noun = re.sub(r'[*()]', '', noun_part).strip()
|
105 |
+
description = re.sub(r'[*()]', '', description).strip()
|
106 |
+
|
107 |
+
# Separate the description at em dash if present
|
108 |
+
if ' — ' in description:
|
109 |
+
description = description.split(' — ', 1)[0].strip()
|
110 |
+
elif ' - ' in description:
|
111 |
+
description = description.split(' - ', 1)[0].strip()
|
112 |
+
|
113 |
+
print(f" - Found potential noun: '{noun}' with description: '{description[:30]}...'")
|
114 |
+
|
115 |
+
# Skip if noun contains invalid characters or is too short
|
116 |
+
if '##' not in noun and len(noun) > 1 and noun[0].isalpha():
|
117 |
+
sound_sources[noun] = description
|
118 |
+
print(f" √ Added to sound sources")
|
119 |
+
else:
|
120 |
+
print(f" × Skipped (invalid format)")
|
121 |
+
|
122 |
+
# If no structured format found, try to extract nouns from the text
|
123 |
+
if not sound_sources:
|
124 |
+
print("No structured format found, falling back to noun extraction")
|
125 |
+
all_nouns = []
|
126 |
+
doc = nlp(caption_text)
|
127 |
+
for token in doc:
|
128 |
+
if token.pos_ == "NOUN" and len(token.text) > 1:
|
129 |
+
if token.text[0].isalpha():
|
130 |
+
all_nouns.append(token.text.lower())
|
131 |
+
print(f" - Extracted noun: '{token.text.lower()}'")
|
132 |
+
|
133 |
+
for noun in all_nouns:
|
134 |
+
sound_sources[noun] = "" # Empty description
|
135 |
+
|
136 |
+
print(f"[DEBUG] Final detected sound sources: {list(sound_sources.keys())}")
|
137 |
+
return sound_sources
|
138 |
+
|
139 |
+
def map_bbox_to_depth_zone(self, bbox, depth_map, num_zones=3):
|
140 |
+
x1, y1, x2, y2 = [int(coord) for coord in bbox]
|
141 |
+
|
142 |
+
height, width = depth_map.shape
|
143 |
+
x1, y1 = max(0, x1), max(0, y1)
|
144 |
+
x2, y2 = min(width, x2), min(height, y2)
|
145 |
+
|
146 |
+
depth_roi = depth_map[y1:y2, x1:x2]
|
147 |
+
|
148 |
+
if depth_roi.size == 0:
|
149 |
+
return num_zones - 1
|
150 |
+
|
151 |
+
mean_depth = np.mean(depth_roi)
|
152 |
+
|
153 |
+
thresholds = self.create_histogram_depth_zones(depth_map, num_zones)
|
154 |
+
for i in range(num_zones):
|
155 |
+
if thresholds[i] <= mean_depth < thresholds[i+1]:
|
156 |
+
return i
|
157 |
+
return num_zones - 1
|
158 |
+
|
159 |
+
def detect_objects(self, nouns : list, image: Image):
|
160 |
+
filtered_nouns = []
|
161 |
+
for noun in nouns:
|
162 |
+
if '##' not in noun and len(noun) > 1 and noun[0].isalpha():
|
163 |
+
filtered_nouns.append(noun)
|
164 |
+
|
165 |
+
print(f"Detecting objects for nouns: {filtered_nouns}")
|
166 |
+
|
167 |
+
if self.dino is None:
|
168 |
+
self.dino = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(self.device)
|
169 |
+
self.dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
|
170 |
+
else:
|
171 |
+
self.dino = self.dino.to(self.device)
|
172 |
+
|
173 |
+
text_prompt = " . ".join(filtered_nouns)
|
174 |
+
inputs = self.dino_processor(images=image, text=text_prompt, return_tensors="pt").to(self.device)
|
175 |
+
|
176 |
+
with torch.no_grad():
|
177 |
+
outputs = self.dino(**inputs)
|
178 |
+
results = self.dino_processor.post_process_grounded_object_detection(
|
179 |
+
outputs,
|
180 |
+
inputs.input_ids,
|
181 |
+
box_threshold=0.25,
|
182 |
+
text_threshold=0.25,
|
183 |
+
target_sizes=[image.size[::-1]]
|
184 |
+
)
|
185 |
+
|
186 |
+
result = results[0]
|
187 |
+
labels = result["labels"]
|
188 |
+
bboxes = result["boxes"]
|
189 |
+
|
190 |
+
clean_labels = []
|
191 |
+
for label in labels:
|
192 |
+
clean_label = re.sub(r'##\w+', '', label)
|
193 |
+
clean_label = self._split_combined_words(clean_label, filtered_nouns)
|
194 |
+
clean_labels.append(clean_label)
|
195 |
+
|
196 |
+
self.dino = self.dino.to("cpu")
|
197 |
+
torch.cuda.empty_cache()
|
198 |
+
del inputs, outputs, results
|
199 |
+
|
200 |
+
print(f"Detected objects: {clean_labels}")
|
201 |
+
|
202 |
+
return (clean_labels, bboxes)
|
203 |
+
|
204 |
+
def _split_combined_words(self, text, nouns=None):
|
205 |
+
nlp = self._load_nlp()
|
206 |
+
if nouns is None:
|
207 |
+
known_words = set()
|
208 |
+
doc = nlp(text)
|
209 |
+
for token in doc:
|
210 |
+
if token.pos_ == "NOUN" and len(token.text) > 1:
|
211 |
+
known_words.add(token.text.lower())
|
212 |
+
else:
|
213 |
+
known_words = set(nouns)
|
214 |
+
|
215 |
+
result = []
|
216 |
+
for word in text.split():
|
217 |
+
if word in known_words:
|
218 |
+
result.append(word)
|
219 |
+
continue
|
220 |
+
|
221 |
+
found = False
|
222 |
+
for known in known_words:
|
223 |
+
if known in word and len(known) > 2:
|
224 |
+
result.append(known)
|
225 |
+
found = True
|
226 |
+
|
227 |
+
if not found:
|
228 |
+
result.append(word)
|
229 |
+
|
230 |
+
return " ".join(result)
|
231 |
+
|
232 |
+
def process_dino_labels(self, labels):
|
233 |
+
processed_labels = []
|
234 |
+
nlp = self._load_nlp()
|
235 |
+
|
236 |
+
for label in labels:
|
237 |
+
if label.startswith('##'):
|
238 |
+
continue
|
239 |
+
label = re.sub(r'[*()]', '', label).strip()
|
240 |
+
|
241 |
+
parts = label.split()
|
242 |
+
for part in parts:
|
243 |
+
if part.startswith('##'):
|
244 |
+
continue
|
245 |
+
doc = nlp(part)
|
246 |
+
for token in doc:
|
247 |
+
if token.pos_ == "NOUN" and len(token.text) > 1:
|
248 |
+
processed_labels.append(token.text.lower())
|
249 |
+
|
250 |
+
unique_labels = []
|
251 |
+
for label in processed_labels:
|
252 |
+
if label not in unique_labels:
|
253 |
+
unique_labels.append(label)
|
254 |
+
|
255 |
+
return unique_labels
|
256 |
+
|
257 |
+
|
258 |
+
def create_histogram_depth_zones(self, depth_map, num_zones = 3):
|
259 |
+
# using 50 bins because it is faster
|
260 |
+
hist, bin_edge = np.histogram(depth_map.flatten(), bins=50, range=(0, 1))
|
261 |
+
cumulative = np.cumsum(hist) / np.sum(hist)
|
262 |
+
thresholds = [0.0]
|
263 |
+
for i in range(1, num_zones):
|
264 |
+
target = i / num_zones
|
265 |
+
            idx = np.argmin(np.abs(cumulative - target))
            thresholds.append(bin_edge[idx + 1])
        thresholds.append(1.0)

        return thresholds


    def analyze_object_depths(self, image_path, depth_map, lat, lon, caption_data=None, all_objects=False):
        image = Image.open(image_path)

        if caption_data is None:
            caption = generate_caption(lat, lon)
            if not caption:
                print(f"Failed to generate caption for {image_path}")
                return []
            caption_text = caption.get("sound_description", "")
        else:
            caption_text = caption_data.get("sound_description", "")

        # Debug: print the raw caption text
        print(f"\n[DEBUG] Raw caption text for {os.path.basename(image_path)}:")
        print(caption_text)
        print("-" * 50)

        if not caption_text:
            print(f"No caption text available for {image_path}")
            return []

        # Extract nouns and their sound descriptions
        sound_sources = self.detect_sound_sources(caption_text)

        # Debug: print the extracted sound sources
        print("[DEBUG] Extracted sound sources:")
        for noun, desc in sound_sources.items():
            print(f"  - {noun}: {desc}")
        print("-" * 50)

        if not sound_sources:
            print(f"No sound sources detected in caption for {image_path}")
            return []

        # Get the list of nouns only, for object detection
        nouns = list(sound_sources.keys())

        # Debug: print the list of nouns being used for detection
        print(f"[DEBUG] Nouns for object detection: {nouns}")
        print("-" * 50)

        labels, bboxes = self.detect_objects(nouns, image)
        if len(labels) == 0 or len(bboxes) == 0:
            print(f"No objects detected in {image_path}")
            return []

        object_data = []
        known_objects = set(nouns) if nouns else set()

        for i, (label, bbox) in enumerate(zip(labels, bboxes)):
            if '##' in label:
                continue

            x1, y1, x2, y2 = [int(coord) for coord in bbox]
            height, width = depth_map.shape
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(width, x2), min(height, y2)

            depth_roi = depth_map[y1:y2, x1:x2]
            if depth_roi.size == 0:
                continue

            mean_depth = np.mean(depth_roi)

            matched_noun = None
            matched_desc = None

            for word in label.split():
                word = word.lower()
                if word in sound_sources:
                    matched_noun = word
                    matched_desc = sound_sources[word]
                    break
            if matched_noun is None:
                for noun in sound_sources:
                    if noun in label.lower():
                        matched_noun = noun
                        matched_desc = sound_sources[noun]
                        break
            if matched_noun is None:
                for word in label.split():
                    if len(word) > 1 and word[0].isalpha() and '##' not in word:
                        matched_noun = word.lower()
                        matched_desc = ""  # No description available
                        break

            if matched_noun:
                thresholds = self.create_histogram_depth_zones(depth_map, num_zones=3)
                zone = 0  # The default is 0, which is the closest zone
                for i in range(3):
                    if thresholds[i] <= mean_depth < thresholds[i + 1]:
                        zone = i
                        break

                object_data.append({
                    "original_label": matched_noun,
                    "bbox": bbox.tolist(),
                    "depth_zone": zone,
                    "zone_description": ["near", "medium", "far"][zone],
                    "mean_depth": mean_depth,
                    "weight": 1.0 - mean_depth,
                    "sound_description": matched_desc
                })
        if all_objects:
            object_data.sort(key=lambda x: x["mean_depth"])
            return object_data
        else:
            if not object_data:
                return []
            closest_object = min(object_data, key=lambda x: x["mean_depth"])
            return [closest_object]

    def cleanup(self):
        if hasattr(self, 'depth_estimator') and self.depth_estimator is not None:
            del self.depth_estimator
            self.depth_estimator = None

        if self.map_list is not None:
            del self.map_list
            self.map_list = None

        if self.dino is not None:
            self.dino = self.dino.to("cpu")
            del self.dino
            self.dino = None
            del self.dino_processor
            self.dino_processor = None

        if self.nlp is not None:
            del self.nlp
            self.nlp = None
        torch.cuda.empty_cache()
        gc.collect()

    def test_object_depth_analysis(self):
        """
        Test the object depth analysis on all images in the directory.
        """
        # Process depth maps first
        processed_maps = self.process_depth_maps()

        # Get the list of original image paths
        image_dir = self.depth_estimator.image_dir
        image_paths = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]

        results = []

        # For each image and its corresponding depth map
        for i, (image_path, processed_map) in enumerate(zip(image_paths, processed_maps)):
            # Extract the normalized depth map
            depth_map = processed_map["normalization"]

            # Analyze objects and their depths
            object_depths = self.analyze_object_depths(image_path, depth_map)

            # Store results
            results.append({
                "image_path": image_path,
                "object_depths": object_depths
            })

            # Print some information for debugging
            print(f"Analyzed {image_path}:")
            for obj in object_depths:
                print(f"  - {obj['original_label']} (Zone: {obj['zone_description']})")

        return results

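
A minimal, self-contained sketch (not part of the upload) of the equal-population depth zoning that `create_histogram_depth_zones` and the zone lookup above implement: the depth histogram is split so each zone holds roughly the same number of pixels, and an object's mean depth is then mapped to near/medium/far. The synthetic depth map, the helper names, and the bin count of 256 are illustrative assumptions, not the repository's exact values.

```python
# Illustrative sketch only; assumes a depth map already normalized to [0, 1].
import numpy as np

def histogram_depth_zones(depth_map: np.ndarray, num_zones: int = 3) -> list:
    """Thresholds splitting the depth histogram into zones of roughly equal pixel mass."""
    hist, bin_edges = np.histogram(depth_map.ravel(), bins=256, range=(0.0, 1.0))
    cumulative = np.cumsum(hist) / hist.sum()
    thresholds = [0.0]
    for z in range(1, num_zones):
        target = z / num_zones                      # e.g. 1/3 and 2/3 of all pixels
        idx = np.argmin(np.abs(cumulative - target))
        thresholds.append(bin_edges[idx + 1])
    thresholds.append(1.0)
    return thresholds

def zone_of(mean_depth: float, thresholds: list) -> int:
    for z in range(len(thresholds) - 1):
        if thresholds[z] <= mean_depth < thresholds[z + 1]:
            return z
    return len(thresholds) - 2                      # mean_depth == 1.0 falls in the last zone

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    depth = rng.beta(2, 5, size=(240, 320))         # synthetic normalized depth map
    th = histogram_depth_zones(depth, num_zones=3)
    print("thresholds:", [round(t, 3) for t in th])
    print("zone of 0.1:", ["near", "medium", "far"][zone_of(0.1, th)])
```
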
app.py
ADDED
@@ -0,0 +1,182 @@
import os
import gc
from pathlib import Path

import gradio as gr
import torch
import torchaudio

from config import LOGS_DIR, OUTPUT_DIR
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio

# Ensure required directories exist
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Prepare the external model dir and download the DepthFM checkpoint if missing
depthfm_ckpt = Path('external_models/depth-fm/checkpoints/depthfm-v1.ckpt')
if not depthfm_ckpt.exists():
    depthfm_ckpt.parent.mkdir(parents=True, exist_ok=True)
    os.system('wget https://ommer-lab.com/files/depthfm/depthfm-v1.ckpt -P external_models/depth-fm/checkpoints/')


# Clear CUDA cache between runs
def clear_cuda():
    torch.cuda.empty_cache()
    gc.collect()


def process_images(
    image_dir: str,
    output_dir: str,
    panoramic: bool,
    view: str,
    model: str,
    location: str,
    audio_duration: int,
    cpu_only: bool
) -> None:
    # Existing processing logic; generates files in OUTPUT_DIR
    lat, lon = location.split(",")
    os.makedirs(output_dir, exist_ok=True)
    sound_mapper = SoundMapper()
    audio_generator = GenerateAudio()

    if panoramic:
        # Panoramic: generate per-view audio, then the composition
        view_results = generate_caption(lat, lon, view=view, model=model,
                                        cpu_only=cpu_only, panoramic=True)
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = sorted(Path(image_dir).glob("*.jpg"))
        audios = {}
        for vr in view_results:
            cv = vr["view"]
            img_file = Path(image_dir) / f"{cv}.jpg"
            if not img_file.exists():
                continue
            idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
            if not idx:
                continue
            depth_map = processed_maps[idx[0]]["normalization"]
            obj_depths = sound_mapper.analyze_object_depths(
                str(img_file), depth_map, lat, lon,
                caption_data=vr, all_objects=False
            )
            if not obj_depths:
                continue
            out_wav = Path(output_dir) / f"sound_{cv}.wav"
            audio, sr = audio_generator.process_and_generate_audio(
                obj_depths, duration=audio_duration
            )
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            torchaudio.save(str(out_wav), audio, sr)
            audios[cv] = str(out_wav)
        # Final panoramic composition
        comp = Path(output_dir) / "panoramic_composition.wav"
        compose_audio(list(audios.values()), [1.0] * len(audios), str(comp))
        audios['panorama'] = str(comp)
        clear_cuda()
        return

    # Single view: generate one audio
    vr = generate_caption(lat, lon, view=view, model=model,
                          cpu_only=cpu_only, panoramic=False)
    img_file = Path(image_dir) / f"{view}.jpg"
    processed_maps = sound_mapper.process_depth_maps()
    image_paths = sorted(Path(image_dir).glob("*.jpg"))
    idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
    depth_map = processed_maps[idx[0]]["normalization"]
    obj_depths = sound_mapper.analyze_object_depths(
        str(img_file), depth_map, lat, lon,
        caption_data=vr, all_objects=True
    )
    out_wav = Path(output_dir) / f"sound_{view}.wav"
    audio, sr = audio_generator.process_and_generate_audio(obj_depths, duration=audio_duration)
    if audio.dim() == 3:
        audio = audio.squeeze(0)
    elif audio.dim() == 1:
        audio = audio.unsqueeze(0)
    torchaudio.save(str(out_wav), audio, sr)
    clear_cuda()

# Gradio UI
demo = gr.Blocks(title="Panoramic Audio Generator")
with demo:
    gr.Markdown("""
    # Panoramic Audio Generator

    Displays each view with its audio side by side.
    """
    )

    with gr.Row():
        panoramic = gr.Checkbox(label="Panoramic (multi-view)", value=False)
        view = gr.Dropdown(["front", "back", "left", "right"], value="front", label="View")
        location = gr.Textbox(value="52.3436723,4.8529625", label="Location (lat,lon)")
        # The vision-language model is fixed; keep it as a hidden component so it can be
        # passed through the event's inputs list (a plain string is not a valid Gradio input).
        model = gr.Textbox(value="intern_2_5-4B", visible=False)
        audio_duration = gr.Slider(1, 60, value=10, step=1, label="Audio Duration (sec)")
        cpu_only = gr.Checkbox(label="CPU Only", value=False)
        btn = gr.Button("Generate")

    # Output layout: two rows of two
    with gr.Row():
        with gr.Column():
            img_front = gr.Image(label="Front View", type="filepath")
            aud_front = gr.Audio(label="Front Audio", type="filepath")
        with gr.Column():
            img_back = gr.Image(label="Back View", type="filepath")
            aud_back = gr.Audio(label="Back Audio", type="filepath")
    with gr.Row():
        with gr.Column():
            img_left = gr.Image(label="Left View", type="filepath")
            aud_left = gr.Audio(label="Left Audio", type="filepath")
        with gr.Column():
            img_right = gr.Image(label="Right View", type="filepath")
            aud_right = gr.Audio(label="Right Audio", type="filepath")
    # Panorama at the bottom
    img_pan = gr.Image(label="Panorama View", type="filepath")
    aud_pan = gr.Audio(label="Panoramic Audio", type="filepath")

    # Generate all outputs and update the preview
    def run_all(pan, vw, loc, mdl, dur, cpu):
        # Generate files
        process_images(LOGS_DIR, OUTPUT_DIR, pan, vw, mdl, loc, dur, cpu)
        # Collect files
        views = ["front", "back", "left", "right", "panorama"]
        paths = {}
        for v in views:
            img = Path(LOGS_DIR) / f"{v}.jpg"
            audio = Path(OUTPUT_DIR) / ("panoramic_composition.wav" if v == "panorama" else f"sound_{v}.wav")
            paths[v] = {
                'img': str(img) if img.exists() else None,
                'aud': str(audio) if audio.exists() else None
            }
        return (
            paths['front']['img'], paths['front']['aud'],
            paths['back']['img'], paths['back']['aud'],
            paths['left']['img'], paths['left']['aud'],
            paths['right']['img'], paths['right']['aud'],
            paths['panorama']['img'], paths['panorama']['aud']
        )

    btn.click(
        fn=run_all,
        inputs=[panoramic, view, location, model, audio_duration, cpu_only],
        outputs=[
            img_front, aud_front,
            img_back, aud_back,
            img_left, aud_left,
            img_right, aud_right,
            img_pan, aud_pan
        ]
    )

if __name__ == "__main__":
    demo.launch(share=True)

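
A hedged variant, not wired into app.py: the panoramic branch above mixes the four view tracks with equal weights (`[1.0] * len(audios)`), while each detected object already carries a depth-derived `weight` of `1.0 - mean_depth`. The sketch below shows how those weights could drive `compose_audio` from audio_mixer.py (listed next); the `per_view` paths and weight values are illustrative placeholders.

```python
# Hedged sketch: weight each view's track by the proximity of its closest
# detected object instead of mixing the views equally.
from audio_mixer import compose_audio

# Illustrative data in the shape app.py produces: a wav path per view plus the
# "weight" (1.0 - mean_depth) field returned by analyze_object_depths.
per_view = {
    "front": {"wav": "output/sound_front.wav", "weight": 0.82},
    "back":  {"wav": "output/sound_back.wav",  "weight": 0.35},
    "left":  {"wav": "output/sound_left.wav",  "weight": 0.55},
    "right": {"wav": "output/sound_right.wav", "weight": 0.41},
}

paths = [v["wav"] for v in per_view.values()]
weights = [v["weight"] for v in per_view.values()]  # compose_audio renormalizes these to sum to 1

compose_audio(paths, weights, "output/panoramic_composition.wav")
```
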
audio_mixer.py
ADDED
@@ -0,0 +1,428 @@
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import os
from typing import List, Tuple
from config import LOGS_DIR


## Some utilities:
def load_audio_files(file_paths: List[str]) -> List[Tuple[torch.Tensor, int]]:
    """
    Load multiple audio files and ensure they have the same length.

    Args:
        file_paths: List of paths to audio files

    Returns:
        List of tuples containing audio data and sample rate
    """
    audio_data = []

    for path in file_paths:
        # Load audio file
        waveform, sample_rate = torchaudio.load(path)
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        audio_data.append((waveform.squeeze(), sample_rate))

    # Verify all audio files have the same length and sample rate
    lengths = [len(audio) for audio, _ in audio_data]
    sample_rates = [sr for _, sr in audio_data]

    if len(set(lengths)) > 1:
        raise ValueError(f"Audio files have different lengths: {lengths}")
    if len(set(sample_rates)) > 1:
        raise ValueError(f"Audio files have different sample rates: {sample_rates}")

    return audio_data


def normalize_audio_volumes(audio_data: List[Tuple[torch.Tensor, int]]) -> List[Tuple[torch.Tensor, int]]:
    """
    Normalize the volume of each audio file to have the same energy level.

    Args:
        audio_data: List of tuples containing audio data and sample rate

    Returns:
        List of tuples containing normalized audio data and sample rate
    """
    normalized_data = []

    # Calculate RMS (root mean square) for each audio
    rms_values = []
    for audio, sr in audio_data:
        # Calculate energy (squared amplitude)
        energy = torch.mean(audio ** 2)
        # Calculate RMS (square root of mean energy)
        rms = torch.sqrt(energy)
        rms_values.append(rms)

    # Find the target RMS (the median, to avoid outliers)
    target_rms = torch.median(torch.tensor(rms_values))

    # Normalize each audio to the target RMS
    for (audio, sr), rms in zip(audio_data, rms_values):
        if rms > 0:  # Avoid division by zero
            # Calculate scaling factor
            scaling_factor = target_rms / rms
            # Apply scaling
            normalized_audio = audio * scaling_factor
        else:
            normalized_audio = audio

        normalized_data.append((normalized_audio, sr))

    return normalized_data


def plot_energy_comparison(original_metrics: List[dict], normalized_metrics: List[dict], file_names: List[str], output_path: str = "./logs/energy_comparison.png") -> None:
    """
    Plot a comparison of energy metrics before and after normalization.

    Args:
        original_metrics: List of dictionaries containing metrics for original audio
        normalized_metrics: List of dictionaries containing metrics for normalized audio
        file_names: List of audio file names
        output_path: Path to save the plot
    """
    fig, axs = plt.subplots(2, 2, figsize=(14, 10))

    # Extract metrics
    orig_rms = [m['rms'] for m in original_metrics]
    norm_rms = [m['rms'] for m in normalized_metrics]

    orig_peak = [m['peak'] for m in original_metrics]
    norm_peak = [m['peak'] for m in normalized_metrics]

    orig_dr = [m['dynamic_range_db'] for m in original_metrics]
    norm_dr = [m['dynamic_range_db'] for m in normalized_metrics]

    orig_cf = [m['crest_factor'] for m in original_metrics]
    norm_cf = [m['crest_factor'] for m in normalized_metrics]

    # Prepare x-axis
    x = np.arange(len(file_names))
    width = 0.35

    # Plot RMS (volume)
    axs[0, 0].bar(x - width/2, orig_rms, width, label='Original')
    axs[0, 0].bar(x + width/2, norm_rms, width, label='Normalized')
    axs[0, 0].set_title('RMS Energy (Volume)')
    axs[0, 0].set_xticks(x)
    axs[0, 0].set_xticklabels(file_names, rotation=45, ha='right')
    axs[0, 0].set_ylabel('RMS Value')
    axs[0, 0].legend()

    # Plot Peak Amplitude
    axs[0, 1].bar(x - width/2, orig_peak, width, label='Original')
    axs[0, 1].bar(x + width/2, norm_peak, width, label='Normalized')
    axs[0, 1].set_title('Peak Amplitude')
    axs[0, 1].set_xticks(x)
    axs[0, 1].set_xticklabels(file_names, rotation=45, ha='right')
    axs[0, 1].set_ylabel('Peak Value')
    axs[0, 1].legend()

    # Plot Dynamic Range
    axs[1, 0].bar(x - width/2, orig_dr, width, label='Original')
    axs[1, 0].bar(x + width/2, norm_dr, width, label='Normalized')
    axs[1, 0].set_title('Dynamic Range (dB)')
    axs[1, 0].set_xticks(x)
    axs[1, 0].set_xticklabels(file_names, rotation=45, ha='right')
    axs[1, 0].set_ylabel('dB')
    axs[1, 0].legend()

    # Plot Crest Factor
    axs[1, 1].bar(x - width/2, orig_cf, width, label='Original')
    axs[1, 1].bar(x + width/2, norm_cf, width, label='Normalized')
    axs[1, 1].set_title('Crest Factor (Peak-to-RMS Ratio)')
    axs[1, 1].set_xticks(x)
    axs[1, 1].set_xticklabels(file_names, rotation=45, ha='right')
    axs[1, 1].set_ylabel('Ratio')
    axs[1, 1].legend()

    plt.tight_layout()

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

    # Save the plot
    plt.savefig(output_path)
    plt.close()


def calculate_audio_metrics(audio_data: List[Tuple[torch.Tensor, int]]) -> List[dict]:
    """
    Calculate various audio metrics for each audio file.

    Args:
        audio_data: List of tuples containing audio data and sample rate

    Returns:
        List of dictionaries containing metrics
    """
    metrics = []

    for audio, sr in audio_data:
        # Calculate RMS (root mean square)
        energy = torch.mean(audio ** 2)
        rms = torch.sqrt(energy)

        # Calculate peak amplitude
        peak = torch.max(torch.abs(audio))

        # Calculate dynamic range
        if torch.min(torch.abs(audio[audio != 0])) > 0:
            min_non_zero = torch.min(torch.abs(audio[audio != 0]))
            dynamic_range_db = 20 * torch.log10(peak / min_non_zero)
        else:
            dynamic_range_db = torch.tensor(float('inf'))

        # Calculate crest factor (peak-to-RMS ratio)
        crest_factor = peak / rms if rms > 0 else torch.tensor(float('inf'))

        metrics.append({
            'rms': rms.item(),
            'peak': peak.item(),
            'dynamic_range_db': dynamic_range_db.item() if not torch.isinf(dynamic_range_db) else float('inf'),
            'crest_factor': crest_factor.item() if not torch.isinf(crest_factor) else float('inf')
        })

    return metrics


def create_weighted_composite(
    audio_data: List[Tuple[torch.Tensor, int]],
    weights: List[float]
) -> torch.Tensor:
    """
    Create a weighted composite of multiple audio files.

    Args:
        audio_data: List of tuples containing audio data and sample rate
        weights: List of weights for each audio file

    Returns:
        Weighted composite audio data
    """
    if len(audio_data) != len(weights):
        raise ValueError("Number of audio files and weights must match")

    # Normalize weights to sum to 1
    weights = torch.tensor(weights) / sum(weights)

    # Initialize composite audio with zeros
    composite = torch.zeros_like(audio_data[0][0])

    # Add weighted audio data
    for (audio, _), weight in zip(audio_data, weights):
        composite += audio * weight

    # Normalize to prevent clipping
    max_val = torch.max(torch.abs(composite))
    if max_val > 1.0:
        composite = composite / max_val

    return composite


def create_melspectrograms(
    audio_data: List[Tuple[torch.Tensor, int]],
    composite: torch.Tensor,
    sr: int
) -> List[torch.Tensor]:
    """
    Create melspectrograms for individual audio files and the composite.

    Args:
        audio_data: List of tuples containing audio data and sample rate
        composite: Composite audio data
        sr: Sample rate

    Returns:
        List of melspectrogram data
    """
    specs = []

    # Create mel spectrogram transform
    mel_transform = T.MelSpectrogram(
        sample_rate=sr,
        n_fft=2048,
        win_length=2048,
        hop_length=512,
        n_mels=128,
        f_max=8000
    )

    # Generate spectrograms for individual audio files
    for audio, _ in audio_data:
        melspec = mel_transform(audio)
        specs.append(melspec)

    # Generate spectrogram for composite audio
    composite_melspec = mel_transform(composite)
    specs.append(composite_melspec)

    return specs


def plot_melspectrograms(
    specs: List[torch.Tensor],
    sr: int,
    file_names: List[str],
    weights: List[float],
    output_path: str = "melspectrograms.png"
) -> None:
    """
    Plot melspectrograms for individual audio files and the composite.

    Args:
        specs: List of melspectrogram data
        sr: Sample rate
        file_names: List of audio file names
        weights: List of weights for each audio file
        output_path: Path to save the plot
    """
    fig, axs = plt.subplots(len(specs), 1, figsize=(12, 4 * len(specs)))

    # Create labels for the plots
    labels = [f"{name} (weight: {weight:.2f})" for name, weight in zip(file_names, weights)]
    labels.append("Composite.wav")

    # Convert to dB scale (similar to librosa's power_to_db)
    def power_to_db(spec):
        return 10 * torch.log10(spec + 1e-10)

    # Plot each melspectrogram
    for i, (spec, label) in enumerate(zip(specs, labels)):
        spec_db = power_to_db(spec).numpy().squeeze()

        # For the single-subplot case
        if len(specs) == 1:
            ax = axs
        else:
            ax = axs[i]

        img = ax.imshow(
            spec_db,
            aspect='auto',
            origin='lower',
            interpolation='none',
            extent=[0, spec_db.shape[1], 0, sr/2]
        )
        ax.set_title(label)
        ax.set_ylabel('Frequency (Hz)')
        ax.set_xlabel('Time Frames')

    # No colorbar as requested

    plt.tight_layout()

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    # Save the plot
    plt.savefig(output_path, dpi=300)
    plt.close()


def compose_audio(
    file_paths: List[str],
    weights: List[float],
    output_audio_path: str = os.path.join(LOGS_DIR, "composite.wav"),
    output_plot_path: str = os.path.join(LOGS_DIR, "plot/melspectrograms.png"),
    energy_plot_path: str = os.path.join(LOGS_DIR, "plot/energy_comparison.png")
) -> None:
    """
    Main function to process audio files and create visualizations.

    Args:
        file_paths: List of paths to audio files (supports 4 audio files)
        weights: List of weights for each audio file
        output_audio_path: Path to save the composite audio
        output_plot_path: Path to save the melspectrogram plot
        energy_plot_path: Path to save the energy comparison plot
    """
    # Load audio files
    audio_data = load_audio_files(file_paths)

    # Calculate metrics for the original audio
    print("Calculating metrics for original audio...")
    original_metrics = calculate_audio_metrics(audio_data)

    # Normalize audio volumes to the same energy level
    print("Normalizing audio volumes...")
    normalized_audio_data = normalize_audio_volumes(audio_data)

    # Calculate metrics for the normalized audio
    print("Calculating metrics for normalized audio...")
    normalized_metrics = calculate_audio_metrics(normalized_audio_data)

    # Print energy comparison
    print("\nAudio Energy Comparison (RMS values):")
    print("-" * 50)
    print(f"{'File':<20} {'Original':<15} {'Normalized':<15} {'Scaling Factor':<15}")
    print("-" * 50)
    for i, path in enumerate(file_paths):
        file_name = path.split("/")[-1]
        orig_rms = original_metrics[i]['rms']
        norm_rms = normalized_metrics[i]['rms']
        scaling = norm_rms / orig_rms if orig_rms > 0 else float('inf')
        print(f"{file_name[:20]:<20} {orig_rms:<15.6f} {norm_rms:<15.6f} {scaling:<15.6f}")

    # Create energy comparison plot
    print("\nCreating energy comparison plot...")
    file_names = [path.split("/")[-1] for path in file_paths]
    plot_energy_comparison(original_metrics, normalized_metrics, file_names, energy_plot_path)

    # Get sample rate (all files have the same sample rate)
    sr = normalized_audio_data[0][1]

    # Create weighted composite
    print("\nCreating weighted composite...")
    composite = create_weighted_composite(normalized_audio_data, weights)

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_audio_path) or '.', exist_ok=True)

    # Save composite audio
    print("Saving composite audio...")
    torchaudio.save(output_audio_path, composite.unsqueeze(0), sr)

    # Create melspectrograms for the normalized audio (not the original)
    print("Creating melspectrograms for normalized audio...")
    specs = create_melspectrograms(normalized_audio_data, composite, sr)

    # Get file names without the path
    labeled_file_names = [path.split("/")[-1] for path in file_paths]

    # Plot melspectrograms
    print("Plotting melspectrograms...")
    plot_melspectrograms(specs, sr, labeled_file_names, weights, output_plot_path)

    print(f"\nComposite audio saved to {output_audio_path}")
    print(f"Melspectrograms saved to {output_plot_path}")
    print(f"Energy comparison saved to {energy_plot_path}")


# if __name__ == "__main__":
#     import argparse
#
#     parser = argparse.ArgumentParser(description="Mix audio files with weights and create melspectrograms")
#     parser.add_argument("--files", nargs="+", required=True, help="Paths to audio files")
#     parser.add_argument("--weights", nargs="+", type=float, required=True, help="Weights for each audio file")
#     parser.add_argument("--output-audio", default="./logs/composite.wav", help="Path to save the composite audio")
#     parser.add_argument("--output-plot", default="./logs/melspectrograms.png", help="Path to save the melspectrogram plot")
#
#     args = parser.parse_args()
#     os.makedirs("./logs", exist_ok=True)
#     compose_audio(args.files, args.weights, args.output_audio, args.output_plot)


# Example usage:
# python audio_mixer.py --files audio1.wav audio2.wav audio3.wav audio4.wav --weights 0.4 0.3 0.2 0.1

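
A quick synthetic check of the two core steps in `compose_audio` above, RMS matching followed by a weighted, clip-safe sum. It uses two generated tones rather than files, so the tone frequencies, levels, and the 0.7/0.3 weights are arbitrary illustration values.

```python
# Sanity-check sketch for normalize_audio_volumes and create_weighted_composite.
import torch

from audio_mixer import normalize_audio_volumes, create_weighted_composite

sr = 44100
t = torch.arange(sr * 2) / sr                        # 2 seconds of samples
quiet = 0.05 * torch.sin(2 * torch.pi * 220.0 * t)   # low-level 220 Hz tone
loud = 0.80 * torch.sin(2 * torch.pi * 440.0 * t)    # much louder 440 Hz tone

pair = [(quiet, sr), (loud, sr)]
normalized = normalize_audio_volumes(pair)           # both tracks now share the target RMS

for name, (audio, _) in zip(["quiet", "loud"], normalized):
    print(name, "RMS after normalization:", torch.sqrt(torch.mean(audio ** 2)).item())

mix = create_weighted_composite(normalized, [0.7, 0.3])
print("composite peak (should be <= 1.0):", torch.max(torch.abs(mix)).item())
```
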
config.py
ADDED
@@ -0,0 +1,16 @@
import os

# Base directories
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
LOGS_DIR = os.path.join(BASE_DIR, "logs")
OUTPUT_DIR = os.path.join(BASE_DIR, "output")

# Model paths
EXTERNAL_MODELS_DIR = os.path.join(BASE_DIR, "external_models")
DEPTH_FM_DIR = os.path.join(EXTERNAL_MODELS_DIR, "depth-fm")
DEPTH_FM_CHECKPOINT = os.path.join(DEPTH_FM_DIR, "checkpoints/depthfm-v1.ckpt")  # You will need to download the checkpoint manually. Here is the link: https://github.com/CompVis/depth-fm/tree/main/checkpoints
TANGO_FLUX_DIR = os.path.join(EXTERNAL_MODELS_DIR, "TangoFlux")

# Create required directories
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

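
An illustrative sketch (not part of the upload) of how the other modules are expected to consume these constants; the printed messages are placeholders, and the checkpoint check mirrors the automatic download that app.py performs.

```python
# Illustration only: importing the shared paths and checking the DepthFM checkpoint.
from pathlib import Path

from config import LOGS_DIR, OUTPUT_DIR, DEPTH_FM_CHECKPOINT

print("View images are read from:", LOGS_DIR)
print("Generated audio is written to:", OUTPUT_DIR)

if not Path(DEPTH_FM_CHECKPOINT).exists():
    # app.py downloads this file automatically; offline setups need the manual download noted above.
    print("Missing DepthFM checkpoint:", DEPTH_FM_CHECKPOINT)
```
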
environment.yml
ADDED
@@ -0,0 +1,8 @@
name: geosynthsound
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.10
  - pip:
      - -r requirements.txt

external_models/TangoFlux/.gitignore
ADDED
@@ -0,0 +1,175 @@
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# PyPI configuration file
.pypirc


.DS_Store

*.wav

external_models/TangoFlux/Demo.ipynb
ADDED
@@ -0,0 +1,117 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/OIEIEIO/TangoFlux/blob/main/Demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "id": "xiaRzuzPOP4H"
   },
   "outputs": [],
   "source": [
    "!pip install git+https://github.com/declare-lab/TangoFlux.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "id": "Hfu3zXTDOP4J"
   },
   "outputs": [],
   "source": [
    "import IPython\n",
    "import torchaudio\n",
    "from tangoflux import TangoFluxInference\n",
    "from IPython.display import Audio\n",
    "\n",
    "model = TangoFluxInference(name='declare-lab/TangoFlux')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oFiak5QIOP4K"
   },
   "outputs": [],
   "source": [
    "# @title Generate Audio\n",
    "\n",
    "prompt = 'a futuristic space craft with unique engine sound' # @param {type:\"string\"}\n",
    "duration = 10 # @param {type:\"number\"}\n",
    "steps = 50 # @param {type:\"number\"}\n",
    "\n",
    "audio = model.generate(prompt, steps=steps, duration=duration)\n",
    "\n",
    "Audio(data=audio, rate=44100)"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import IPython\n",
    "import torchaudio\n",
    "from tangoflux import TangoFluxInference\n",
    "from IPython.display import Audio\n",
    "\n",
    "model = TangoFluxInference(name='declare-lab/TangoFlux')\n",
    "\n",
    "# @title Generate Audio\n",
    "prompt = 'Melodic human whistling harmonizing with natural birdsong' # @param {type:\"string\"}\n",
    "duration = 10 # @param {type:\"number\"}\n",
    "steps = 50 # @param {type:\"number\"}\n",
    "\n",
    "# Generate the audio\n",
    "audio = model.generate(prompt, steps=steps, duration=duration)\n",
    "\n",
    "# Ensure audio is in the correct format (2D Tensor: [channels, samples])\n",
    "if len(audio.shape) == 1:  # If mono audio (1D tensor)\n",
    "    audio_tensor = audio.unsqueeze(0)  # Add channel dimension to make it [1, samples]\n",
    "elif len(audio.shape) == 2:  # Stereo audio (2D tensor)\n",
    "    audio_tensor = audio  # Already in correct format\n",
    "else:\n",
    "    raise ValueError(f\"Unexpected audio tensor shape: {audio.shape}\")\n",
    "\n",
    "# Save the audio as a .wav file\n",
    "torchaudio.save('generated_audio.wav', audio_tensor, sample_rate=44100)\n",
    "\n",
    "# Optionally play the audio in the notebook\n",
    "Audio(data=audio.numpy(), rate=44100)\n"
   ],
   "metadata": {
    "id": "_Z8elHyOHOQ1"
   },
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "colab": {
   "provenance": [],
   "machine_shape": "hm",
   "private_outputs": true,
   "include_colab_link": true
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

external_models/TangoFlux/Inference.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
external_models/TangoFlux/LICENSE.md
ADDED
@@ -0,0 +1,51 @@
# LICENSE

## 1. Model & License Summary

This repository contains **TangoFlux** (the “Model”) created for **non-commercial, research-only** purposes under the **UK data copyright exemption**. The Model is subject to:

1. The **Stability AI Community License Agreement**, provided in the file ```STABILITY_AI_COMMUNITY_LICENSE.md```.
2. The **WavCaps** license requirement: **only academic uses** are permitted for data sourced from WavCaps.
3. The **original licenses** of the datasets used in training.

By using or distributing this Model, you **agree** to adhere to all applicable licenses and restrictions, as summarized below.

---

## 2. Stability AI Community License Requirements

- You must comply with the **Stability AI Community License Agreement** (the “Agreement”) for any usage, distribution, or modification of this Model.
- **Non-Commercial Use**: This Model is for research and academic purposes only. Any commercial usage requires registering with Stability AI or obtaining a separate commercial license.
- **Attribution & Notice**:
  - Retain the notice:
    ```
    This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved.
    ```
  - Clearly display “Powered by Stability AI” if you build upon or showcase this Model.
- **Disclaimer & Liability**: This Model is provided **“AS IS”** with **no warranties**. Neither we nor Stability AI will be liable for any claim or damages related to Model use.

See ```STABILITY_AI_COMMUNITY_LICENSE.md``` for the full text.

---

## 3. WavCaps & Dataset Usage

- **Academic-Only for WavCaps**: By accessing any WavCaps-sourced data (including audio clips via provided links), you agree to use them **strictly for non-commercial, academic research** in accordance with WavCaps’ terms.
- **WavCaps Audio**: Each WavCaps audio subset has its own license terms. **You** are responsible for reviewing and complying with those licenses, including attribution requirements on your end.

---

## 4. UK Data Copyright Exemption

This Model was developed under the **UK data copyright exemption for non-commercial research**. Distribution or use outside these bounds must **not** violate that exemption or infringe on any underlying dataset’s license.

---

## 5. Further Information

- **Stability AI License Terms**: <https://stability.ai/community-license>
- **WavCaps License**: <https://github.com/XinhaoMei/WavCaps?tab=readme-ov-file#license>

---

**End of License**.

external_models/TangoFlux/Notice
ADDED
@@ -0,0 +1 @@
This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved

external_models/TangoFlux/README.md
ADDED
@@ -0,0 +1,188 @@
<div align="center">
<img src="assets/tf_opener.png" alt="TangoFluxOpener" width="1000" />

<br/>

[](https://arxiv.org/abs/2412.21037) [](https://huggingface.co/declare-lab/TangoFlux) [](https://tangoflux.github.io/) [](https://huggingface.co/spaces/declare-lab/TangoFlux) [](https://huggingface.co/datasets/declare-lab/CRPO) [](https://replicate.com/declare-lab/tangoflux)

<img src="assets/tf_teaser.png" alt="TangoFlux" width="1000" />
<br/>

</div>

* Powered by **Stability AI**

## News

> 📣 1/3/25: We have released the CRPO dataset as well as the script to perform CRPO dataset generation!

## Demos

[](https://huggingface.co/spaces/declare-lab/TangoFlux)

[](https://colab.research.google.com/github/declare-lab/TangoFlux/blob/main/Demo.ipynb)

## Overall Pipeline

TangoFlux consists of FluxTransformer blocks, which are Diffusion Transformers (DiT) and Multimodal Diffusion Transformers (MMDiT) conditioned on a textual prompt and a duration embedding to generate 44.1kHz audio up to 30 seconds long. TangoFlux learns a rectified flow trajectory to an audio latent representation encoded by a variational autoencoder (VAE). The TangoFlux training pipeline consists of three stages: pre-training, fine-tuning, and preference optimization with CRPO. CRPO, in particular, iteratively generates new synthetic data and constructs preference pairs for preference optimization using the DPO loss for flow matching.



🚀 **TangoFlux can generate 44.1kHz stereo audio up to 30 seconds in ~3 seconds on a single A40 GPU.**

## Installation

```bash
pip install git+https://github.com/declare-lab/TangoFlux
```

## Inference

TangoFlux can generate audio up to 30 seconds long. You must pass a duration to the `model.generate` function when using the Python API. Please note that the duration should be between 1 and 30.

### Web Interface

Run the following command to start the web interface:

```bash
tangoflux-demo
```

### CLI

Use the CLI to generate audio from text.

```bash
tangoflux "Hammer slowly hitting the wooden table" output.wav --duration 10 --steps 50
```

### Python API

```python
import torchaudio
from tangoflux import TangoFluxInference

model = TangoFluxInference(name='declare-lab/TangoFlux')
audio = model.generate('Hammer slowly hitting the wooden table', steps=50, duration=10)

torchaudio.save('output.wav', audio, 44100)
```

### [ComfyUI](https://github.com/comfyanonymous/ComfyUI)

> This UI will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface.

Check [this](https://github.com/LucipherDev/ComfyUI-TangoFlux) repo for the TangoFlux custom node for *ComfyUI*. (Thanks to [LucipherDev](https://github.com/LucipherDev))

Our evaluation shows that inference with 50 steps yields the best results. CFG scales of 3.5, 4, and 4.5 yield similar quality output. Inference with 25 steps yields similar audio quality at a faster speed.

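As a hedged illustration of that trade-off, the snippet below renders the same prompt once at 25 steps for a quick preview and once at the 50-step setting recommended above, using the Python API from the Inference section; the output file names are arbitrary.

```python
# Illustration of the step-count trade-off; reuses the API shown in the Python API section.
import torchaudio
from tangoflux import TangoFluxInference

model = TangoFluxInference(name='declare-lab/TangoFlux')
prompt = 'Hammer slowly hitting the wooden table'

draft = model.generate(prompt, steps=25, duration=10)   # faster preview
final = model.generate(prompt, steps=50, duration=10)   # best-quality setting per the evaluation

torchaudio.save('draft_25_steps.wav', draft, 44100)
torchaudio.save('final_50_steps.wav', final, 44100)
```
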
## Training

We use the `accelerate` package from Hugging Face for multi-GPU training. Run `accelerate config` to set up your run configuration. The default accelerate config is in the `configs` folder. Please specify the path to your training files in `configs/tangoflux_config.yaml`. Samples of `train.json` and `val.json` have been provided. Replace them with your own audio.

`tangoflux_config.yaml` defines the training file paths and model hyperparameters:

```bash
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' tangoflux/train.py --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml'
```

To perform DPO training, modify the training files such that each data point contains "chosen", "reject", "caption" and "duration" fields. Please specify the path to your training files in `configs/tangoflux_config.yaml`. An example has been provided in `train_dpo.json`. Replace it with your own audio.

```bash
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' tangoflux/train_dpo.py --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml'
```

## Evaluation

### TangoFlux vs. Other Audio Generation Models

The key comparison metrics include:

- **Output Length**: Represents the duration of the generated audio.
- **FD**<sub>openl3</sub>: Fréchet Distance.
- **KL**<sub>passt</sub>: KL divergence.
- **CLAP**<sub>score</sub>: Alignment score.

All the inference times are observed on the same A40 GPU. The counts of trainable parameters are reported in the **\#Params** column.

| Model | Params | Duration | Steps | FD<sub>openl3</sub> ↓ | KL<sub>passt</sub> ↓ | CLAP<sub>score</sub> ↑ | IS ↑ | Inference Time (s) |
|---|---|---|---|---|---|---|---|---|
| **AudioLDM 2 (Large)** | 712M | 10 sec | 200 | 108.3 | 1.81 | 0.419 | 7.9 | 24.8 |
| **Stable Audio Open** | 1056M | 47 sec | 100 | 89.2 | 2.58 | 0.291 | 9.9 | 8.6 |
| **Tango 2** | 866M | 10 sec | 200 | 108.4 | 1.11 | 0.447 | 9.0 | 22.8 |
| **TangoFlux (Base)** | 515M | 30 sec | 50 | 80.2 | 1.22 | 0.431 | 11.7 | 3.7 |
| **TangoFlux** | 515M | 30 sec | 50 | 75.1 | 1.15 | 0.480 | 12.2 | 3.7 |

## CRPO dataset generation

There are two Python files for CRPO dataset generation:

- `tangoflux/generate_crpo.py` generates the CRPO dataset given the path to a prompt bank and the model weights. You can specify the sample size as well as the number of samples per prompt for CRPO in the arguments.
- `tangoflux/label_crpo.py` labels the generated audio and constructs preference pairs. It also creates a `train.json` in the output dir that can be passed into `train_dpo.py`.

You can follow the example in `crpo.sh`, which generates the CRPO dataset and then performs reward labelling to produce the `train.json`.

To run CRPO for multiple iterations, simply repeat the above process multiple times, each time setting the correct model weights.

## Citation

```bibtex
@misc{hung2024tangofluxsuperfastfaithful,
      title={TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization},
      author={Chia-Yu Hung and Navonil Majumder and Zhifeng Kong and Ambuj Mehrish and Amir Zadeh and Chuan Li and Rafael Valle and Bryan Catanzaro and Soujanya Poria},
      year={2024},
      eprint={2412.21037},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2412.21037},
}
```

## LICENSE

### 1. Model & License Summary

This repository contains **TangoFlux** (the “Model”) created for **non-commercial, research-only** purposes under the **UK data copyright exemption**. The Model is subject to:

1. The **Stability AI Community License Agreement**, provided in the file ```STABILITY_AI_COMMUNITY_LICENSE.md```.
2. The **WavCaps** license requirement: **only academic uses** are permitted for data sourced from WavCaps.
3. The **original licenses** of the datasets used in training.

By using or distributing this Model, you **agree** to adhere to all applicable licenses and restrictions, as summarized below.

---

### 2. Stability AI Community License Requirements

- You must comply with the **Stability AI Community License Agreement** (the “Agreement”) for any usage, distribution, or modification of this Model.
- **Non-Commercial Use**: This Model is for research and academic purposes only. Any commercial usage requires registering with Stability AI or obtaining a separate commercial license.
- **Attribution & Notice**:
  - Retain the notice:
    ```
    This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved.
    ```
  - Clearly display “Powered by Stability AI” if you build upon or showcase this Model.
- **Disclaimer & Liability**: This Model is provided **“AS IS”** with **no warranties**. Neither we nor Stability AI will be liable for any claim or damages related to Model use.

See ```STABILITY_AI_COMMUNITY_LICENSE.md``` for the full text.

---

### 3. WavCaps & Dataset Usage

- **Academic-Only for WavCaps**: By accessing any WavCaps-sourced data (including audio clips via provided links), you agree to use them **strictly for non-commercial, academic research** in accordance with WavCaps’ terms.
- **WavCaps Audio**: Each WavCaps audio subset has its own license terms. **You** are responsible for reviewing and complying with those licenses, including attribution requirements on your end.

---

### 4. UK Data Copyright Exemption

This Model was developed under the **UK data copyright exemption for non-commercial research**. Distribution or use outside these bounds must **not** violate that exemption or infringe on any underlying dataset’s license.

---

### 5. Further Information

- **Stability AI License Terms**: <https://stability.ai/community-license>
- **WavCaps License**: <https://github.com/XinhaoMei/WavCaps?tab=readme-ov-file#license>

---

**End of License**.

external_models/TangoFlux/STABILITY_AI_COMMUNITY_LICENSE.md
ADDED
@@ -0,0 +1,57 @@
STABILITY AI COMMUNITY LICENSE AGREEMENT

Last Updated: July 5, 2024
1. INTRODUCTION

This Agreement applies to any individual person or entity (“You”, “Your” or “Licensee”) that uses or distributes any portion or element of the Stability AI Materials or Derivative Works thereof for any Research & Non-Commercial or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.

This Agreement is intended to allow research, non-commercial, and limited commercial uses of the Models free of charge. In order to ensure that certain limited commercial uses of the Models continue to be allowed, this Agreement preserves free access to the Models for people or organizations generating annual revenue of less than US $1,000,000 (or local currency equivalent).

By clicking “I Accept” or by using or distributing or using any portion or element of the Stability Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then “You” includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity’s behalf.

2. RESEARCH & NON-COMMERCIAL USE LICENSE

Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose. “Research Purpose” means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. “Non-Commercial Purpose” means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.

3. COMMERCIAL USE LICENSE

Subject to the terms of this Agreement (including the remainder of this Section III), Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Commercial Purpose. “Commercial Purpose” means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for commercial advantage or monetary compensation to You or others, including but not limited to, (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, and (ii) for Your business’s or organization’s internal operations.
If You are using or distributing the Stability AI Materials for a Commercial Purpose, You must register with Stability AI at (https://stability.ai/community-license). If at any time You or Your Affiliate(s), either individually or in aggregate, generate more than USD $1,000,000 in annual revenue (or the equivalent thereof in Your local currency), regardless of whether that revenue is generated directly or indirectly from the Stability AI Materials or Derivative Works, any licenses granted to You under this Agreement shall terminate as of such date. You must request a license from Stability AI at (https://stability.ai/enterprise), which Stability AI may grant to You in its sole discretion. If you receive Stability AI Materials, or any Derivative Works thereof, from a Licensee as part of an integrated end user product, then Section III of this Agreement will not apply to you.

4. GENERAL TERMS

Your Research, Non-Commercial, and Commercial License(s) under this Agreement are subject to the following terms.
a. Distribution & Attribution. If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved”, and (iii) prominently display “Powered by Stability AI” on a related website, user interface, blogpost, about page, or product documentation. If You create a Derivative Work, You may add your own attribution notice(s) to the “Notice” text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the “Notice” text file that You changed the Stability AI Materials and how it was modified.
|
25 |
+
b. Use Restrictions. Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI’s AUP, which is hereby incorporated by reference. Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).
|
26 |
+
c. Intellectual Property.
|
27 |
+
(i) Trademark License. No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein.
|
28 |
+
(ii) Ownership of Derivative Works. As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI’s ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.
|
29 |
+
(iii) Ownership of Outputs. As between You and Stability AI, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.
|
30 |
+
(iv) Disputes. If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement.
|
31 |
+
(v) Feedback. From time to time, You may provide Stability AI with verbal and/or written suggestions, comments or other feedback related to Stability AI’s existing or prospective technology, products or services (collectively, “Feedback”). You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided “AS IS” and You make no warranties whatsoever about any Feedback.
|
32 |
+
d. Disclaimer Of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE STABILITY AI MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE STABILITY AI MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE STABILITY AI MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
|
33 |
+
e. Limitation Of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
|
34 |
+
f. Term And Termination. The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Stability AI Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Stability AI Materials or Derivative Works. Section IV(d), (e), and (g) shall survive the termination of this Agreement.
|
35 |
+
g. Governing Law. This Agreement will be governed by and constructed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.
|
36 |
+
|
37 |
+
5. DEFINITIONS
|
38 |
+
|
39 |
+
“Affiliate(s)” means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, “control” means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
|
40 |
+
|
41 |
+
"Agreement" means this Stability AI Community License Agreement.
|
42 |
+
|
43 |
+
“AUP” means the Stability AI Acceptable Use Policy available at (https://stability.ai/use-policy), as may be updated from time to time.
|
44 |
+
|
45 |
+
"Derivative Work(s)” means (a) any derivative work of the Stability AI Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output, including “fine tune” and “low-rank adaptation” models derived from a Model or a Model’s output, but do not include the output of any Model.
|
46 |
+
|
47 |
+
“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software or Models.
|
48 |
+
|
49 |
+
“Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing listed on Stability’s Core Models Webpage available at (https://stability.ai/core-models), as may be updated from time to time.
|
50 |
+
|
51 |
+
"Stability AI" or "we" means Stability AI Ltd. and its Affiliates.
|
52 |
+
|
53 |
+
"Software" means Stability AI’s proprietary software made available under this Agreement now or in the future.
|
54 |
+
|
55 |
+
“Stability AI Materials” means, collectively, Stability’s proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
|
56 |
+
|
57 |
+
“Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
|
external_models/TangoFlux/__init__.py
ADDED
@@ -0,0 +1,4 @@
try:
    from .comfyui import *
except:
    pass
external_models/TangoFlux/assets/tangoflux.png
ADDED
(binary image, stored with Git LFS)
external_models/TangoFlux/assets/tf_opener.png
ADDED
(binary image, stored with Git LFS)
external_models/TangoFlux/assets/tf_teaser.png
ADDED
(binary image, stored with Git LFS)
external_models/TangoFlux/comfyui/README.md
ADDED
@@ -0,0 +1,78 @@
# ComfyUI-TangoFlux
ComfyUI Custom Nodes for ["TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching"](https://arxiv.org/abs/2412.21037). These nodes, adapted from [the official implementation](https://github.com/declare-lab/TangoFlux/), generate high-quality 44.1kHz audio of up to 30 seconds from just a text prompt.

## Installation

1. Navigate to your ComfyUI's custom_nodes directory:
```bash
cd ComfyUI/custom_nodes
```

2. Clone this repository:
```bash
git clone https://github.com/declare-lab/TangoFlux ComfyUI-TangoFlux
```

3. Install requirements:
```bash
cd ComfyUI-TangoFlux/comfyui
python install.py
```

### Or Install via ComfyUI Manager

#### Check out some demos from [the official demo page](https://tangoflux.github.io/)

## Example Workflow



## Usage

**All the necessary models should be automatically downloaded when the TangoFluxLoader node is used for the first time.**

**Models can also be downloaded using the `install.py` script.**



**Manual Download:**
- Download TangoFlux from [here](https://huggingface.co/declare-lab/TangoFlux/tree/main) into `models/tangoflux`
- Download text encoders from [here](https://huggingface.co/google/flan-t5-large/tree/main) into `models/text_encoders/google-flan-t5-large`

*(Include everything as shown in the screenshot above. Do not rename anything.)*
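If you prefer to script the manual download, the same two repositories can be fetched with `huggingface_hub`. This mirrors what `comfyui/install.py` (included later in this upload) does; the `ComfyUI/models` path below is an assumption for a default install and should be adjusted to your own layout.

```python
# Sketch: download the TangoFlux weights and the flan-t5-large text encoder into
# the folders the loader expects. Mirrors comfyui/install.py; "ComfyUI/models"
# is an assumed default ComfyUI models directory.
from huggingface_hub import snapshot_download

models_dir = "ComfyUI/models"  # assumption: default ComfyUI layout

snapshot_download(
    repo_id="declare-lab/TangoFlux",
    allow_patterns=["*.json", "*.safetensors"],
    local_dir=f"{models_dir}/tangoflux",
)
snapshot_download(
    repo_id="google/flan-t5-large",
    allow_patterns=["*.json", "*.safetensors", "*.model"],
    local_dir=f"{models_dir}/text_encoders/google-flan-t5-large",
)
```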
The nodes can be found in the "TangoFlux" category as `TangoFluxLoader`, `TangoFluxSampler`, and `TangoFluxVAEDecodeAndPlay`.



> [TeaCache](https://github.com/LiewFeng/TeaCache) can speed up TangoFlux 2x without much audio quality degradation, in a training-free manner.
>
> ## 📈 Inference Latency Comparisons on a Single A800
>
> | TangoFlux | TeaCache (0.25) | TeaCache (0.4) |
> |:---------:|:---------------:|:--------------:|
> | ~4.08 s   | ~2.42 s         | ~1.95 s        |
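The 0.25 and 0.4 in the table are values of the loader's `rel_l1_thresh` widget. The caching rule itself (see `teacache.py` later in this upload) accumulates a rescaled relative-L1 change of the modulated input and reuses the cached residual while the accumulator stays under that threshold. A minimal, self-contained sketch of that decision logic (the per-step change values in the example are made up for illustration):

```python
# Toy sketch of the TeaCache skip rule used in teacache.py: a higher
# rel_l1_thresh lets more consecutive steps reuse the cached residual.
import numpy as np

# Polynomial fitted for TangoFlux (coefficients copied from teacache.py).
rescale = np.poly1d([4.98651651e02, -2.83781631e02, 5.58554382e01,
                     -3.82021401e00, 2.64230861e-01])

def plan_steps(rel_changes, rel_l1_thresh):
    """rel_changes: per-step relative L1 change of the modulated input."""
    accumulated, decisions = 0.0, []
    for step, change in enumerate(rel_changes):
        if step == 0 or step == len(rel_changes) - 1:
            accumulated, should_calc = 0.0, True  # always compute first/last step
        else:
            accumulated += rescale(change)
            should_calc = accumulated >= rel_l1_thresh
            if should_calc:
                accumulated = 0.0  # reset after a full transformer pass
        decisions.append(should_calc)
    return decisions

# Example with made-up change values (illustration only):
print(plan_steps([0.0, 0.01, 0.012, 0.015, 0.02, 0.0], rel_l1_thresh=0.25))
```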
## Citation

```bibtex
@misc{hung2024tangofluxsuperfastfaithful,
      title={TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching and Clap-Ranked Preference Optimization},
      author={Chia-Yu Hung and Navonil Majumder and Zhifeng Kong and Ambuj Mehrish and Rafael Valle and Bryan Catanzaro and Soujanya Poria},
      year={2024},
      eprint={2412.21037},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2412.21037},
}
```
```
@article{liu2024timestep,
  title={Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model},
  author={Liu, Feng and Zhang, Shiwei and Wang, Xiaofeng and Wei, Yujie and Qiu, Haonan and Zhao, Yuzhong and Zhang, Yingya and Ye, Qixiang and Wan, Fang},
  journal={arXiv preprint arXiv:2411.19108},
  year={2024}
}
```
external_models/TangoFlux/comfyui/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .nodes import NODE_CLASS_MAPPINGS
from .server import *

WEB_DIRECTORY = "./comfyui/web"

__all__ = ["NODE_CLASS_MAPPINGS", "WEB_DIRECTORY"]
external_models/TangoFlux/comfyui/example_workflow.json
ADDED
@@ -0,0 +1,168 @@
{
  "last_node_id": 13,
  "last_link_id": 15,
  "nodes": [
    {
      "id": 10,
      "type": "TangoFluxLoader",
      "pos": [380, 320],
      "size": [210, 102],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "TANGOFLUX_MODEL", "links": [11], "slot_index": 0},
        {"name": "vae", "type": "TANGOFLUX_VAE", "links": [15], "slot_index": 1}
      ],
      "properties": {"Node name for S&R": "TangoFluxLoader"},
      "widgets_values": [false, 0.25]
    },
    {
      "id": 13,
      "type": "TangoFluxVAEDecodeAndPlay",
      "pos": [1060, 320],
      "size": [315, 126],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {"name": "vae", "type": "TANGOFLUX_VAE", "link": 15},
        {"name": "latents", "type": "TANGOFLUX_LATENTS", "link": 14}
      ],
      "outputs": [],
      "properties": {"Node name for S&R": "TangoFluxVAEDecodeAndPlay"},
      "widgets_values": ["TangoFlux", "wav", true]
    },
    {
      "id": 11,
      "type": "TangoFluxSampler",
      "pos": [620, 320],
      "size": [400, 220],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "TANGOFLUX_MODEL", "link": 11}
      ],
      "outputs": [
        {"name": "latents", "type": "TANGOFLUX_LATENTS", "links": [14], "slot_index": 0}
      ],
      "properties": {"Node name for S&R": "TangoFluxSampler"},
      "widgets_values": ["A dog barking near the ocean, ocean waves crashing.", 50, 3, 10, 106139285587780, "randomize", 1]
    }
  ],
  "links": [
    [11, 10, 0, 11, 0, "TANGOFLUX_MODEL"],
    [14, 11, 0, 13, 1, "TANGOFLUX_LATENTS"],
    [15, 10, 1, 13, 0, "TANGOFLUX_VAE"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {"scale": 0.9480295566502464, "offset": [-200.83333333333337, -102.2460379319304]},
    "node_versions": {"comfyui-tangoflux": "1.0.4"}
  },
  "version": 0.4
}
external_models/TangoFlux/comfyui/install.py
ADDED
@@ -0,0 +1,79 @@
import sys
import os
import logging
import subprocess
import traceback
import json
import re

log = logging.getLogger("TangoFlux")

download_models = True

EXT_PATH = os.path.dirname(os.path.abspath(__file__))

try:
    folder_paths_path = os.path.abspath(os.path.join(EXT_PATH, "..", "..", "..", "folder_paths.py"))

    sys.path.append(os.path.dirname(folder_paths_path))

    import folder_paths

    TANGOFLUX_DIR = os.path.join(folder_paths.models_dir, "tangoflux")
    TEXT_ENCODER_DIR = os.path.join(folder_paths.models_dir, "text_encoders")
except:
    download_models = False

try:
    log.info("Installing requirements")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", f"{EXT_PATH}/requirements.txt", "--no-warn-script-location"])

    if download_models:
        from huggingface_hub import snapshot_download

        log.info("Downloading Necessary models")

        try:
            log.info(f"Downloading TangoFlux models to: {TANGOFLUX_DIR}")
            snapshot_download(
                repo_id="declare-lab/TangoFlux",
                allow_patterns=["*.json", "*.safetensors"],
                local_dir=TANGOFLUX_DIR,
                local_dir_use_symlinks=False,
            )
        except Exception:
            traceback.print_exc()
            log.error("Failed to download TangoFlux models")

        log.info("Loading config")

        with open(os.path.join(TANGOFLUX_DIR, "config.json"), "r") as f:
            config = json.load(f)

        try:
            text_encoder = re.sub(r'[<>:"/\\|?*]', '-', config.get("text_encoder_name", "google/flan-t5-large"))
            text_encoder_path = os.path.join(TEXT_ENCODER_DIR, text_encoder)

            log.info(f"Downloading text encoders to: {text_encoder_path}")
            snapshot_download(
                repo_id=config.get("text_encoder_name", "google/flan-t5-large"),
                allow_patterns=["*.json", "*.safetensors", "*.model"],
                local_dir=text_encoder_path,
                local_dir_use_symlinks=False,
            )
        except Exception:
            traceback.print_exc()
            log.error("Failed to download text encoders")

    try:
        log.info("Installing TangoFlux module")
        subprocess.check_call([sys.executable, "-m", "pip", "install", os.path.join(EXT_PATH, "..")])
    except Exception:
        traceback.print_exc()
        log.error("Failed to install TangoFlux module")

    log.info("TangoFlux Installation completed")

except Exception:
    traceback.print_exc()
    log.error("TangoFlux Installation failed")
external_models/TangoFlux/comfyui/nodes.py
ADDED
@@ -0,0 +1,328 @@
import os
import logging
import json
import random
import torch
import torchaudio
import re

from diffusers import AutoencoderOobleck, FluxTransformer2DModel
from huggingface_hub import snapshot_download

from comfy.utils import load_torch_file, ProgressBar
import folder_paths

from tangoflux.model import TangoFlux
from .teacache import teacache_forward

log = logging.getLogger("TangoFlux")

TANGOFLUX_DIR = os.path.join(folder_paths.models_dir, "tangoflux")
if "tangoflux" not in folder_paths.folder_names_and_paths:
    current_paths = [TANGOFLUX_DIR]
else:
    current_paths, _ = folder_paths.folder_names_and_paths["tangoflux"]
folder_paths.folder_names_and_paths["tangoflux"] = (
    current_paths,
    folder_paths.supported_pt_extensions,
)
TEXT_ENCODER_DIR = os.path.join(folder_paths.models_dir, "text_encoders")


class TangoFluxLoader:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "enable_teacache": ("BOOLEAN", {"default": False}),
                "rel_l1_thresh": (
                    "FLOAT",
                    {"default": 0.25, "min": 0.0, "max": 10.0, "step": 0.01},
                ),
            },
        }

    RETURN_TYPES = ("TANGOFLUX_MODEL", "TANGOFLUX_VAE")
    RETURN_NAMES = ("model", "vae")
    OUTPUT_TOOLTIPS = ("TangoFlux Model", "TangoFlux Vae")

    CATEGORY = "TangoFlux"
    FUNCTION = "load_tangoflux"
    DESCRIPTION = "Load TangoFlux model"

    def __init__(self):
        self.model = None
        self.vae = None
        self.enable_teacache = False
        self.rel_l1_thresh = 0.25
        self.original_forward = FluxTransformer2DModel.forward

    def load_tangoflux(
        self,
        enable_teacache=False,
        rel_l1_thresh=0.25,
        tangoflux_path=TANGOFLUX_DIR,
        text_encoder_path=TEXT_ENCODER_DIR,
        device="cuda",
    ):
        if self.model is None or self.enable_teacache != enable_teacache:

            pbar = ProgressBar(6)

            snapshot_download(
                repo_id="declare-lab/TangoFlux",
                allow_patterns=["*.json", "*.safetensors"],
                local_dir=tangoflux_path,
                local_dir_use_symlinks=False,
            )

            pbar.update(1)

            log.info("Loading config")

            with open(os.path.join(tangoflux_path, "config.json"), "r") as f:
                config = json.load(f)

            pbar.update(1)

            text_encoder = re.sub(
                r'[<>:"/\\|?*]',
                "-",
                config.get("text_encoder_name", "google/flan-t5-large"),
            )
            text_encoder_path = os.path.join(text_encoder_path, text_encoder)

            snapshot_download(
                repo_id=config.get("text_encoder_name", "google/flan-t5-large"),
                allow_patterns=["*.json", "*.safetensors", "*.model"],
                local_dir=text_encoder_path,
                local_dir_use_symlinks=False,
            )

            pbar.update(1)

            log.info("Loading TangoFlux models")

            del self.model
            self.model = None

            model_weights = load_torch_file(
                os.path.join(tangoflux_path, "tangoflux.safetensors"),
                device=torch.device(device),
            )

            pbar.update(1)

            if enable_teacache:
                log.info("Enabling TeaCache")
                FluxTransformer2DModel.forward = teacache_forward
            else:
                log.info("Disabling TeaCache")
                FluxTransformer2DModel.forward = self.original_forward

            model = TangoFlux(config=config, text_encoder_dir=text_encoder_path)

            model.load_state_dict(model_weights, strict=False)
            model.to(device)

            if enable_teacache:
                model.transformer.__class__.enable_teacache = True
                model.transformer.__class__.cnt = 0
                model.transformer.__class__.rel_l1_thresh = rel_l1_thresh
                model.transformer.__class__.accumulated_rel_l1_distance = 0
                model.transformer.__class__.previous_modulated_input = None
                model.transformer.__class__.previous_residual = None

            pbar.update(1)

            self.model = model
            del model
            self.enable_teacache = enable_teacache
            self.rel_l1_thresh = rel_l1_thresh

        if self.vae is None:
            log.info("Loading TangoFlux VAE")

            vae_weights = load_torch_file(
                os.path.join(tangoflux_path, "vae.safetensors")
            )
            self.vae = AutoencoderOobleck()
            self.vae.load_state_dict(vae_weights)
            self.vae.to(device)

            pbar.update(1)

        if self.enable_teacache == True and self.rel_l1_thresh != rel_l1_thresh:
            self.model.transformer.__class__.rel_l1_thresh = rel_l1_thresh

            self.rel_l1_thresh = rel_l1_thresh

        return (self.model, self.vae)


class TangoFluxSampler:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "model": ("TANGOFLUX_MODEL",),
                "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
                "steps": ("INT", {"default": 50, "min": 1, "max": 10000, "step": 1}),
                "guidance_scale": (
                    "FLOAT",
                    {"default": 3, "min": 1, "max": 100, "step": 1},
                ),
                "duration": ("INT", {"default": 10, "min": 1, "max": 30, "step": 1}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}),
                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
            },
        }

    RETURN_TYPES = ("TANGOFLUX_LATENTS",)
    RETURN_NAMES = ("latents",)
    OUTPUT_TOOLTIPS = "TangoFlux Sample"

    CATEGORY = "TangoFlux"
    FUNCTION = "sample"
    DESCRIPTION = "Sampler for TangoFlux"

    def sample(
        self,
        model,
        prompt,
        steps=50,
        guidance_scale=3,
        duration=10,
        seed=0,
        batch_size=1,
        device="cuda",
    ):
        pbar = ProgressBar(steps)

        with torch.no_grad():
            model.to(device)

            try:
                if model.transformer.__class__.enable_teacache:
                    model.transformer.__class__.num_steps = steps
            except:
                pass

            log.info("Generating latents with TangoFlux")

            latents = model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                seed=seed,
                num_samples_per_prompt=batch_size,
                callback_on_step_end=lambda: pbar.update(1),
            )

        return ({"latents": latents, "duration": duration},)


class TangoFluxVAEDecodeAndPlay:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "vae": ("TANGOFLUX_VAE",),
                "latents": ("TANGOFLUX_LATENTS",),
                "filename_prefix": ("STRING", {"default": "TangoFlux"}),
                "format": (
                    ["wav", "mp3", "flac", "aac", "wma"],
                    {"default": "wav"},
                ),
                "save_output": ("BOOLEAN", {"default": True}),
            },
        }

    RETURN_TYPES = ()
    OUTPUT_NODE = True

    CATEGORY = "TangoFlux"
    FUNCTION = "play"
    DESCRIPTION = "Decoder and Player for TangoFlux"

    def decode(self, vae, latents):
        results = []

        for latent in latents:
            decoded = vae.decode(latent.unsqueeze(0).transpose(2, 1)).sample.cpu()
            results.append(decoded)

        results = torch.cat(results, dim=0)

        return results

    def play(
        self,
        vae,
        latents,
        filename_prefix="TangoFlux",
        format="wav",
        save_output=True,
        device="cuda",
    ):
        audios = []
        pbar = ProgressBar(len(latents) + 2)

        if save_output:
            output_dir = folder_paths.get_output_directory()
            prefix_append = ""
            type = "output"
        else:
            output_dir = folder_paths.get_temp_directory()
            prefix_append = "_temp_" + "".join(
                random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)
            )
            type = "temp"

        filename_prefix += prefix_append
        full_output_folder, filename, counter, subfolder, _ = (
            folder_paths.get_save_image_path(filename_prefix, output_dir)
        )

        os.makedirs(full_output_folder, exist_ok=True)

        pbar.update(1)

        duration = latents["duration"]
        latents = latents["latents"]

        vae.to(device)

        log.info("Decoding Tangoflux latents")

        waves = self.decode(vae, latents)

        pbar.update(1)

        for wave in waves:
            waveform_end = int(duration * vae.config.sampling_rate)
            wave = wave[:, :waveform_end]

            file = f"{filename}_{counter:05}_.{format}"

            torchaudio.save(
                os.path.join(full_output_folder, file), wave, sample_rate=44100
            )

            counter += 1

            audios.append({"filename": file, "subfolder": subfolder, "type": type})

            pbar.update(1)

        return {
            "ui": {"audios": audios},
        }


NODE_CLASS_MAPPINGS = {
    "TangoFluxLoader": TangoFluxLoader,
    "TangoFluxSampler": TangoFluxSampler,
    "TangoFluxVAEDecodeAndPlay": TangoFluxVAEDecodeAndPlay,
}
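For orientation, the three classes above pass data to each other exactly as in the example workflow: the loader returns `(model, vae)`, the sampler wraps the latents together with the requested duration, and the decode node writes the audio files. A hedged sketch of that flow driven from Python rather than the graph UI; it still assumes a working ComfyUI environment (the nodes import `comfy.utils` and `folder_paths`), downloaded models, a CUDA device, and an import path that depends on how the custom node package is installed.

```python
# Sketch only: chaining the nodes programmatically inside a ComfyUI environment.
# The import path below is illustrative and depends on your install layout.
from nodes import TangoFluxLoader, TangoFluxSampler, TangoFluxVAEDecodeAndPlay

loader, sampler, player = TangoFluxLoader(), TangoFluxSampler(), TangoFluxVAEDecodeAndPlay()

model, vae = loader.load_tangoflux(enable_teacache=False)
(latents,) = sampler.sample(
    model,
    "A dog barking near the ocean, ocean waves crashing.",
    steps=50, guidance_scale=3, duration=10, seed=0,
)
result = player.play(vae, latents, filename_prefix="TangoFlux", format="wav")
print(result["ui"]["audios"])  # filenames written to the ComfyUI output directory
```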
external_models/TangoFlux/comfyui/requirements.txt
ADDED
@@ -0,0 +1,9 @@
torchaudio
torchlibrosa
torchvision
diffusers
accelerate
datasets
librosa
wandb
tqdm
external_models/TangoFlux/comfyui/server.py
ADDED
@@ -0,0 +1,64 @@
import os
import server
import folder_paths

web = server.web


@server.PromptServer.instance.routes.get("/tangoflux/playaudio")
async def play_audio(request):
    query = request.rel_url.query

    filename = query.get("filename", None)

    if filename is None:
        return web.Response(status=404)

    if filename[0] == "/" or ".." in filename:
        return web.Response(status=403)

    filename, output_dir = folder_paths.annotated_filepath(filename)

    if not output_dir:
        file_type = query.get("type", "output")
        output_dir = folder_paths.get_directory_by_type(file_type)

    if output_dir is None:
        return web.Response(status=400)

    subfolder = query.get("subfolder", None)
    if subfolder:
        full_output_dir = os.path.join(output_dir, subfolder)
        if os.path.commonpath((os.path.abspath(full_output_dir), output_dir)) != output_dir:
            return web.Response(status=403)
        output_dir = full_output_dir

    filename = os.path.basename(filename)
    file_path = os.path.join(output_dir, filename)

    if not os.path.isfile(file_path):
        return web.Response(status=404)

    _, ext = os.path.splitext(filename)
    ext = ext.lower()

    content_types = {
        ".wav": "audio/wav",
        ".mp3": "audio/mpeg",
        ".flac": "audio/flac",
        ".aac": "audio/aac",
        ".wma": "audio/x-ms-wma",
    }

    content_type = content_types.get(ext, None)

    if content_type is None:
        return web.Response(status=400)

    try:
        with open(file_path, "rb") as file:
            data = file.read()
    except:
        return web.Response(status=500)

    return web.Response(body=data, content_type=content_type)
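This route is what the `playAudio.js` widget (next file) calls; it only serves files from the ComfyUI output/temp/input directories and infers the content type from the extension. A small sketch of fetching a generated file over HTTP; the host/port `127.0.0.1:8188` and the example filename are assumptions for illustration, so substitute whatever your server actually listens on and produced.

```python
# Sketch: retrieve a decoded audio file through the /tangoflux/playaudio route.
# Host, port and filename are assumptions for illustration only.
from urllib.parse import urlencode
from urllib.request import urlopen

params = {"filename": "TangoFlux_00001_.wav", "type": "output", "subfolder": ""}
url = "http://127.0.0.1:8188/tangoflux/playaudio?" + urlencode(params)

with urlopen(url) as resp:
    with open("TangoFlux_00001_.wav", "wb") as f:
        f.write(resp.read())
```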
external_models/TangoFlux/comfyui/teacache.py
ADDED
@@ -0,0 +1,283 @@
# Code from https://github.com/ali-vilab/TeaCache/blob/main/TeaCache4TangoFlux/teacache_tango_flux.py

from typing import Any, Dict, Optional, Union
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_version,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
import torch
import numpy as np


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def teacache_forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor = None,
    pooled_projections: torch.Tensor = None,
    timestep: torch.LongTensor = None,
    img_ids: torch.Tensor = None,
    txt_ids: torch.Tensor = None,
    guidance: torch.Tensor = None,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = True,
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
    """
    The [`FluxTransformer2DModel`] forward method.

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
            Input `hidden_states`.
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
            Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
        pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
            from the embeddings of input conditions.
        timestep ( `torch.LongTensor`):
            Used to indicate denoising step.
        block_controlnet_hidden_states: (`list` of `torch.Tensor`):
            A list of tensors that if specified are added to the residuals of transformer blocks.
        joint_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
            tuple.

    Returns:
        If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
        `tuple` where the first element is the sample tensor.
    """
    if joint_attention_kwargs is not None:
        joint_attention_kwargs = joint_attention_kwargs.copy()
        lora_scale = joint_attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0

    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)
    else:
        if (
            joint_attention_kwargs is not None
            and joint_attention_kwargs.get("scale", None) is not None
        ):
            logger.warning(
                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
            )
    hidden_states = self.x_embedder(hidden_states)

    timestep = timestep.to(hidden_states.dtype) * 1000
    if guidance is not None:
        guidance = guidance.to(hidden_states.dtype) * 1000
    else:
        guidance = None
    temb = (
        self.time_text_embed(timestep, pooled_projections)
        if guidance is None
        else self.time_text_embed(timestep, guidance, pooled_projections)
    )
    encoder_hidden_states = self.context_embedder(encoder_hidden_states)

    ids = torch.cat((txt_ids, img_ids), dim=1)
    image_rotary_emb = self.pos_embed(ids)

    if self.enable_teacache:
        inp = hidden_states.clone()
        temb_ = temb.clone()
        modulated_inp, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.transformer_blocks[0].norm1(inp, emb=temb_)
        )
        if self.cnt == 0 or self.cnt == self.num_steps - 1:
            should_calc = True
            self.accumulated_rel_l1_distance = 0
        else:
            coefficients = [
                4.98651651e02,
                -2.83781631e02,
                5.58554382e01,
                -3.82021401e00,
                2.64230861e-01,
            ]
            rescale_func = np.poly1d(coefficients)
            self.accumulated_rel_l1_distance += rescale_func(
                (
                    (modulated_inp - self.previous_modulated_input).abs().mean()
                    / self.previous_modulated_input.abs().mean()
                )
                .cpu()
                .item()
            )
            if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                should_calc = False
            else:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = modulated_inp
        self.cnt += 1
        if self.cnt == self.num_steps:
            self.cnt = 0

    if self.enable_teacache:
        if not should_calc:
            hidden_states += self.previous_residual
        else:
            ori_hidden_states = hidden_states.clone()
            for index_block, block in enumerate(self.transformer_blocks):
                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = (
                        {"use_reentrant": False}
                        if is_torch_version(">=", "1.11.0")
                        else {}
                    )
                    encoder_hidden_states, hidden_states = (
                        torch.utils.checkpoint.checkpoint(
                            create_custom_forward(block),
                            hidden_states,
                            encoder_hidden_states,
                            temb,
                            image_rotary_emb,
                            **ckpt_kwargs,
                        )
                    )

                else:
                    encoder_hidden_states, hidden_states = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                    )

            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

            for index_block, block in enumerate(self.single_transformer_blocks):
                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = (
                        {"use_reentrant": False}
                        if is_torch_version(">=", "1.11.0")
                        else {}
                    )
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        temb,
                        image_rotary_emb,
                        **ckpt_kwargs,
                    )

                else:
                    hidden_states = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                    )

            hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
            self.previous_residual = hidden_states - ori_hidden_states
    else:
        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                encoder_hidden_states, hidden_states = (
                    torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        encoder_hidden_states,
                        temb,
                        image_rotary_emb,
                        **ckpt_kwargs,
                    )
                )

            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

    hidden_states = self.norm_out(hidden_states, temb)
    output = self.proj_out(hidden_states)

    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if not return_dict:
        return (output,)

    return Transformer2DModelOutput(sample=output)
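Note that `teacache_forward` keeps its state (`cnt`, `num_steps`, `accumulated_rel_l1_distance`, `previous_modulated_input`, `previous_residual`) as attributes on the transformer class; in this node pack, `TangoFluxLoader` initialises them when TeaCache is enabled and `TangoFluxSampler` sets `num_steps` per run. If you patch the forward outside of these nodes, the same attributes have to be set up first. A hedged sketch of that wiring, mirroring `nodes.py` above (`transformer` is assumed to be the TangoFlux model's transformer module, and the import path is illustrative):

```python
# Sketch: manually enabling TeaCache for a FluxTransformer2DModel, mirroring
# what TangoFluxLoader / TangoFluxSampler do in nodes.py above.
# `transformer` is an assumed reference to the TangoFlux model's transformer.
from diffusers import FluxTransformer2DModel
from teacache import teacache_forward  # import path is illustrative

FluxTransformer2DModel.forward = teacache_forward

cls = transformer.__class__
cls.enable_teacache = True
cls.rel_l1_thresh = 0.25              # cache aggressiveness (0.25 / 0.4 in the README table)
cls.cnt = 0                           # current denoising step counter
cls.num_steps = 50                    # must match the sampler's step count
cls.accumulated_rel_l1_distance = 0
cls.previous_modulated_input = None
cls.previous_residual = None
```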
external_models/TangoFlux/comfyui/web/js/playAudio.js
ADDED
@@ -0,0 +1,59 @@
import { app } from "../../../scripts/app.js";
import { api } from "../../../scripts/api.js";

app.registerExtension({
  name: "TangoFlux.playAudio",
  async beforeRegisterNodeDef(nodeType, nodeData, app) {
    if (nodeData.name === "TangoFluxVAEDecodeAndPlay") {
      const originalNodeCreated = nodeType.prototype.onNodeCreated;

      nodeType.prototype.onNodeCreated = async function () {
        originalNodeCreated?.apply(this, arguments);
        this.widgets_count = this.widgets?.length || 0;

        this.addAudioWidgets = (audios) => {
          if (this.widgets) {
            for (let i = 0; i < this.widgets.length; i++) {
              if (this.widgets[i].name.startsWith("_playaudio")) {
                this.widgets[i].onRemove?.();
              }
            }
            this.widgets.length = this.widgets_count;
          }

          let index = 0;
          for (const params of audios) {
            const audioElement = document.createElement("audio");
            audioElement.controls = true;

            this.addDOMWidget("_playaudio" + index, "playaudio", audioElement, {
              serialize: false,
              hideOnZoom: false,
            });
            audioElement.src = api.apiURL(
              `/tangoflux/playaudio?${new URLSearchParams(params)}`
            );
            index++;
          }

          requestAnimationFrame(() => {
            const newSize = this.computeSize();
            newSize[0] = Math.max(newSize[0], this.size[0]);
            newSize[1] = Math.max(newSize[1], this.size[1]);
            this.onResize?.(newSize);
            app.graph.setDirtyCanvas(true, false);
          });
        };
      };

      const originalNodeExecuted = nodeType.prototype.onExecuted;

      nodeType.prototype.onExecuted = async function (message) {
        originalNodeExecuted?.apply(this, arguments);
        if (message?.audios) {
          this.addAudioWidgets(message.audios);
        }
      };
    }
  },
});
external_models/TangoFlux/configs/__init__.py
ADDED
File without changes
external_models/TangoFlux/configs/accelerator_config.yaml
ADDED
@@ -0,0 +1,17 @@
{
  "compute_environment": "LOCAL_MACHINE",
  "distributed_type": "MULTI_GPU",
  "main_process_port": 29512,
  "downcast_bf16": false,
  "machine_rank": 0,
  "gpu_ids": "0,1",
  "main_training_function": "main",
  "mixed_precision": "no",
  "num_machines": 1,
  "num_processes": 2,
  "rdzv_backend": "static",
  "same_network": true,
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}
external_models/TangoFlux/configs/tangoflux_config.yaml
ADDED
@@ -0,0 +1,36 @@
# Absolute paths for different resources
paths:
  train_file: "data/train.json"
  val_file: "data/val.json"
  test_file: "data/val.json"
  resume_from_checkpoint: ""
  output_dir: "outputs/"

# Training-related parameters
training:
  per_device_batch_size: 4
  learning_rate: 5e-4
  gradient_accumulation_steps: 1
  num_train_epochs: 80
  num_warmup_steps: 1000
  max_audio_duration: 30

# Model and optimizer parameters
model:
  num_layers: 6
  num_single_layers: 18
  in_channels: 64
  attention_head_dim: 128
  joint_attention_dim: 1024
  num_attention_heads: 8
  audio_seq_len: 645
  max_duration: 30
  uncondition: false
  text_encoder_name: "google/flan-t5-large"
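As a rough illustration of how this file maps onto the code elsewhere in this upload, the `model` section carries the same kind of fields the ComfyUI loader passes into `TangoFlux(config=...)`. A minimal sketch of reading it (assumes PyYAML is installed; how `tangoflux/train.py` actually consumes the file is not shown in this diff):

```python
# Sketch: read the training config and inspect its sections (PyYAML assumed).
import yaml

with open("external_models/TangoFlux/configs/tangoflux_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["paths"]["output_dir"])        # "outputs/"
print(cfg["training"]["learning_rate"])  # note: PyYAML may load bare "5e-4" as a string (YAML 1.1 quirk)
print(cfg["model"]["text_encoder_name"]) # "google/flan-t5-large"
```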
external_models/TangoFlux/crpo.sh
ADDED
@@ -0,0 +1,2 @@
python3 tangoflux/generate_crpo.py --json_path='path_to_prompt_bank.json' --sample_size=50 --model='path_to_tangoflux.safetensors' --num_samples=5 --output_dir='outputs'
python3 tangoflux/label_crpo.py --json_path='outputs/results.json' --output_dir='outputs/crpo_iteration1' --num_samples=5
external_models/TangoFlux/inference.py
ADDED
@@ -0,0 +1,7 @@
import torchaudio
from tangoflux import TangoFluxInference

model = TangoFluxInference(name="declare-lab/TangoFlux")
audio = model.generate("Hammer slowly hitting the wooden table", steps=50, duration=10)

torchaudio.save("output.wav", audio, sample_rate=44100)
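The same `generate` call also accepts a classifier-free guidance scale, as used by the Replicate predictor later in this upload. A small variation of the script above; the prompt and parameter values here are only examples.

```python
# Variation of inference.py: pass an explicit guidance scale (values are examples).
import torchaudio
from tangoflux import TangoFluxInference

model = TangoFluxInference(name="declare-lab/TangoFlux")
audio = model.generate(
    "Rain falling on a tin roof with distant thunder",  # example prompt
    steps=25,
    duration=10,
    guidance_scale=4.5,
)
torchaudio.save("output_rain.wav", audio, sample_rate=44100)
```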
external_models/TangoFlux/replicate_demo/cog.yaml
ADDED
@@ -0,0 +1,31 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - torch==2.4.0
    - torchaudio==2.4.0
    - torchlibrosa==0.1.0
    - torchvision==0.19.0
    - transformers==4.44.0
    - diffusers==0.30.0
    - accelerate==0.34.2
    - datasets==2.21.0
    - librosa
    - ipython

  run:
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
predict: "predict.py:Predictor"
external_models/TangoFlux/replicate_demo/predict.py
ADDED
@@ -0,0 +1,92 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import subprocess
import time
import json
from cog import BasePredictor, Input, Path
from diffusers import AutoencoderOobleck
import soundfile as sf
from safetensors.torch import load_file
from huggingface_hub import snapshot_download
from tangoflux.model import TangoFlux
from tangoflux import TangoFluxInference

MODEL_CACHE = "model_cache"
MODEL_URL = (
    "https://weights.replicate.delivery/default/declare-lab/TangoFlux/model_cache.tar"
)


class CachedTangoFluxInference(TangoFluxInference):
    ## load the weights from replicate.delivery for faster booting
    def __init__(self, name="declare-lab/TangoFlux", device="cuda", cached_paths=None):
        if cached_paths:
            paths = cached_paths
        else:
            paths = snapshot_download(repo_id=name)

        self.vae = AutoencoderOobleck()
        vae_weights = load_file(f"{paths}/vae.safetensors")
        self.vae.load_state_dict(vae_weights)
        weights = load_file(f"{paths}/tangoflux.safetensors")

        with open(f"{paths}/config.json", "r") as f:
            config = json.load(f)
        self.model = TangoFlux(config)
        self.model.load_state_dict(weights, strict=False)
        self.vae.to(device)
        self.model.to(device)


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            print("downloading")
            download_weights(MODEL_URL, MODEL_CACHE)

        self.model = CachedTangoFluxInference(
            cached_paths=f"{MODEL_CACHE}/declare-lab/TangoFlux"
        )

    def predict(
        self,
        prompt: str = Input(
            description="Input prompt", default="Hammer slowly hitting the wooden table"
        ),
        duration: int = Input(
            description="Duration of the output audio in seconds", default=10
        ),
        steps: int = Input(
            description="Number of inference steps", ge=1, le=200, default=25
        ),
        guidance_scale: float = Input(
            description="Scale for classifier-free guidance", ge=1, le=20, default=4.5
        ),
    ) -> Path:
        """Run a single prediction on the model"""

        audio = self.model.generate(
            prompt,
            steps=steps,
            guidance_scale=guidance_scale,
            duration=duration,
        )
        audio_numpy = audio.numpy()
        out_path = "/tmp/out.wav"

        sf.write(
            out_path, audio_numpy.T, samplerate=self.model.vae.config.sampling_rate
        )
        return Path(out_path)
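As a quick sanity check, the predictor above can be exercised outside the Replicate runtime. The following is only a sketch, not part of the repository: it assumes a CUDA GPU, network access to the weights URL, that `cog` and `soundfile` are installed, and that it is run from the `replicate_demo` directory so `predict.py` is importable; parameter names mirror `Predictor.predict()` as defined above.

# Minimal local smoke test for the Cog predictor above (sketch only).
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads model_cache.tar on first run, then loads TangoFlux
wav_path = predictor.predict(
    prompt="Hammer slowly hitting the wooden table",
    duration=10,
    steps=25,
    guidance_scale=4.5,
)
print(f"Generated audio written to {wav_path}")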
external_models/TangoFlux/requirements.txt
ADDED
@@ -0,0 +1,12 @@
torch==2.4.0
torchaudio==2.4.0
torchlibrosa==0.1.0
torchvision==0.19.0
transformers==4.44.0
diffusers==0.30.0
accelerate==0.34.2
datasets==2.21.0
librosa
tqdm
wandb
external_models/TangoFlux/setup.py
ADDED
@@ -0,0 +1,30 @@
from setuptools import setup

setup(
    name="tangoflux",
    description="TangoFlux: Super Fast and Faithful Text to Audio Generation with Flow Matching",
    version="0.1.0",
    packages=["tangoflux"],
    install_requires=[
        "torch==2.4.0",
        "torchaudio==2.4.0",
        "torchlibrosa==0.1.0",
        "torchvision==0.19.0",
        "transformers==4.44.0",
        "diffusers==0.30.0",
        "accelerate==0.34.2",
        "datasets==2.21.0",
        "librosa",
        "tqdm",
        "wandb",
        "click",
        "gradio",
        "torchaudio",
    ],
    entry_points={
        "console_scripts": [
            "tangoflux=tangoflux.cli:main",
            "tangoflux-demo=tangoflux.demo:main",
        ],
    },
)
external_models/TangoFlux/tangoflux/__init__.py
ADDED
@@ -0,0 +1,60 @@
from diffusers import AutoencoderOobleck
import torch
from transformers import T5EncoderModel, T5TokenizerFast
from diffusers import FluxTransformer2DModel
from torch import nn
from typing import List
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.training_utils import compute_density_for_timestep_sampling
import copy
import torch.nn.functional as F
import numpy as np
from tangoflux.model import TangoFlux
from huggingface_hub import snapshot_download
from tqdm import tqdm
from typing import Optional, Union, List
from datasets import load_dataset, Audio
from math import pi
import json
import inspect
import yaml
from safetensors.torch import load_file


class TangoFluxInference:

    def __init__(
        self,
        name="declare-lab/TangoFlux",
        device="cuda" if torch.cuda.is_available() else "cpu",
    ):

        self.vae = AutoencoderOobleck()

        paths = snapshot_download(repo_id=name)
        vae_weights = load_file("{}/vae.safetensors".format(paths))
        self.vae.load_state_dict(vae_weights)
        weights = load_file("{}/tangoflux.safetensors".format(paths))

        with open("{}/config.json".format(paths), "r") as f:
            config = json.load(f)
        self.model = TangoFlux(config)
        self.model.load_state_dict(weights, strict=False)
        # _IncompatibleKeys(missing_keys=['text_encoder.encoder.embed_tokens.weight'], unexpected_keys=[]) this behaviour is expected
        self.vae.to(device)
        self.model.to(device)

    def generate(self, prompt, steps=25, duration=10, guidance_scale=4.5):

        with torch.no_grad():
            latents = self.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
            )

            wave = self.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        waveform_end = int(duration * self.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave
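A typical use of `TangoFluxInference` mirrors what `cli.py` below does: instantiate it, call `generate`, and write the waveform out with torchaudio. The snippet is a sketch that assumes the `declare-lab/TangoFlux` checkpoint can be downloaded from the Hugging Face Hub; the 44.1 kHz sample rate matches the value used by the CLI and demo in this upload.

# Usage sketch for TangoFluxInference as defined above.
import torchaudio
from tangoflux import TangoFluxInference

model = TangoFluxInference(name="declare-lab/TangoFlux")
# generate() returns a (channels, samples) waveform tensor trimmed to `duration` seconds
audio = model.generate("Gentle rain falling on a tin roof", steps=25, duration=10)
torchaudio.save("rain.wav", audio, sample_rate=44100)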
external_models/TangoFlux/tangoflux/cli.py
ADDED
@@ -0,0 +1,29 @@
import click
import torchaudio
from tangoflux import TangoFluxInference

@click.command()
@click.argument('prompt')
@click.argument('output_file')
@click.option('--duration', default=10, type=int, help='Duration in seconds (1-30)')
@click.option('--steps', default=50, type=int, help='Number of inference steps (10-100)')
def main(prompt: str, output_file: str, duration: int, steps: int):
    """Generate audio from text using TangoFlux.

    Args:
        prompt: Text description of the audio to generate
        output_file: Path to save the generated audio file
        duration: Duration of generated audio in seconds (default: 10)
        steps: Number of inference steps (default: 50)
    """
    if not 1 <= duration <= 30:
        raise click.BadParameter('Duration must be between 1 and 30 seconds')
    if not 10 <= steps <= 100:
        raise click.BadParameter('Steps must be between 10 and 100')

    model = TangoFluxInference(name="declare-lab/TangoFlux")
    audio = model.generate(prompt, steps=steps, duration=duration)
    torchaudio.save(output_file, audio, sample_rate=44100)

if __name__ == '__main__':
    main()
external_models/TangoFlux/tangoflux/demo.py
ADDED
@@ -0,0 +1,63 @@
import gradio as gr
import torchaudio
import click
import tempfile
from tangoflux import TangoFluxInference

model = TangoFluxInference(name="declare-lab/TangoFlux")


def generate_audio(prompt, duration, steps):
    audio = model.generate(prompt, steps=steps, duration=duration)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        torchaudio.save(f.name, audio, sample_rate=44100)
        return f.name


examples = [
    ["Hammer slowly hitting the wooden table", 10, 50],
    ["Gentle rain falling on a tin roof", 15, 50],
    ["Wind chimes tinkling in a light breeze", 10, 50],
    ["Rhythmic wooden table tapping overlaid with steady water pouring sound", 10, 50],
]

with gr.Blocks(title="TangoFlux Text-to-Audio Generation") as demo:
    gr.Markdown("# TangoFlux Text-to-Audio Generation")
    gr.Markdown("Generate audio from text descriptions using TangoFlux")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Text Prompt", placeholder="Enter your audio description..."
            )
            duration = gr.Slider(
                minimum=1, maximum=30, value=10, step=1, label="Duration (seconds)"
            )
            steps = gr.Slider(
                minimum=10, maximum=100, value=50, step=10, label="Number of Steps"
            )
            generate_btn = gr.Button("Generate Audio")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    generate_btn.click(
        fn=generate_audio, inputs=[prompt, duration, steps], outputs=audio_output
    )

    gr.Examples(
        examples=examples,
        inputs=[prompt, duration, steps],
        outputs=audio_output,
        fn=generate_audio,
    )

@click.command()
@click.option('--host', default='127.0.0.1', help='Host to bind to')
@click.option('--port', default=None, help='Port to bind to')
@click.option('--share', is_flag=True, help='Enable sharing via Gradio')
def main(host, port, share):
    demo.queue().launch(server_name=host, server_port=port, share=share)

if __name__ == "__main__":
    main()
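Because the `gr.Blocks` object `demo` is built at module level, the interface can also be launched programmatically rather than through the click entry point. This is only a sketch: importing `tangoflux.demo` loads the TangoFlux checkpoint at import time, and `server_port=None` lets Gradio pick a free port, matching the CLI default above.

# Launching the Gradio demo defined above without the click wrapper (sketch).
from tangoflux.demo import demo

demo.queue().launch(server_name="0.0.0.0", server_port=None, share=False)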
external_models/TangoFlux/tangoflux/generate_crpo_dataset.py
ADDED
@@ -0,0 +1,204 @@
import os
import json
import time
import torch
import argparse
import multiprocessing
from tqdm import tqdm
from safetensors.torch import load_file
from diffusers import AutoencoderOobleck
import soundfile as sf
from model import TangoFlux
import random




def generate_audio_chunk(args, chunk, gpu_id, output_dir, samplerate, return_dict, process_id):
    """
    Function to generate audio for a chunk of text prompts on a specific GPU.
    """
    try:
        device = f"cuda:{gpu_id}"
        torch.cuda.set_device(device)
        print(f"Process {process_id}: Using device {device}")

        # Initialize model
        config = {
            'num_layers': 6,
            'num_single_layers': 18,
            'in_channels': 64,
            'attention_head_dim': 128,
            'joint_attention_dim': 1024,
            'num_attention_heads': 8,
            'audio_seq_len': 645,
            'max_duration': 30,
            'uncondition': False,
            'text_encoder_name': "google/flan-t5-large"
        }

        model = TangoFlux(config)
        print(f"Process {process_id}: Loading model from {args.model} on {device}")
        w1 = load_file(args.model)
        model.load_state_dict(w1, strict=False)
        model = model.to(device)
        model.eval()

        # Initialize VAE
        vae = AutoencoderOobleck.from_pretrained("stabilityai/stable-audio-open-1.0", subfolder='vae')
        vae = vae.to(device)
        vae.eval()

        outputs = []

        # Corrected loop using enumerate properly with tqdm
        for idx, item in tqdm(enumerate(chunk), total=len(chunk), desc=f"GPU {gpu_id}"):
            text = item['captions']

            if os.path.exists(os.path.join(output_dir, f"id_{item['id']}_sample1.wav")):
                print("Exist! Skipping!")
                continue
            with torch.no_grad():
                latent = model.inference_flow(
                    text,
                    num_inference_steps=args.num_steps,
                    guidance_scale=args.guidance_scale,
                    duration=10,
                    num_samples_per_prompt=args.num_samples
                )

                #waveform_end = int(duration * vae.config.sampling_rate)
                latent = latent[:, :220, :]  ## 220 correspond to the latent length of audiocaps encoded with this vae. You can modify this
                wave = vae.decode(latent.transpose(2, 1)).sample.cpu()

            for i in range(args.num_samples):
                filename = f"id_{item['id']}_sample{i+1}.wav"
                filepath = os.path.join(output_dir, filename)

                sf.write(filepath, wave[i].T, samplerate)
                outputs.append({
                    "id": item['id'],
                    "sample": i + 1,
                    "path": filepath,
                    "captions": text
                })

        return_dict[process_id] = outputs
        print(f"Process {process_id}: Completed processing on GPU {gpu_id}")

    except Exception as e:
        print(f"Process {process_id}: Error on GPU {gpu_id}: {e}")
        return_dict[process_id] = []

def split_into_chunks(data, num_chunks):
    """
    Splits data into num_chunks approximately equal parts.
    """
    avg = len(data) // num_chunks
    chunks = []
    for i in range(num_chunks):
        start = i * avg
        # Ensure the last chunk takes the remainder
        end = (i + 1) * avg if i != num_chunks - 1 else len(data)
        chunks.append(data[start:end])
    return chunks

def main():
    parser = argparse.ArgumentParser(description="Generate audio using multiple GPUs")
    parser.add_argument('--num_steps', type=int, default=50, help='Number of inference steps')
    parser.add_argument('--model', type=str, required=True, help='Path to tangoflux weights')
    parser.add_argument('--num_samples', type=int, default=5, help='Number of samples per prompt')
    parser.add_argument('--output_dir', type=str, default='output', help='Directory to save outputs')
    parser.add_argument('--json_path', type=str, required=True, help='Path to input JSON file')
    parser.add_argument('--sample_size', type=int, default=20000, help='Number of prompts to sample for CRPO')
    parser.add_argument('--guidance_scale', type=float, default=4.5, help='Guidance scale used for generation')
    args = parser.parse_args()

    # Check GPU availability
    num_gpus = torch.cuda.device_count()
    sample_size = args.sample_size


    # Load JSON data
    import json
    try:
        with open(args.json_path, 'r') as f:
            data = json.load(f)

    except Exception as e:
        print(f"Error loading JSON file {args.json_path}: {e}")
        return

    if not isinstance(data, list):
        print("Error: JSON data is not a list.")
        return

    if len(data) < sample_size:
        print(f"Warning: JSON data contains only {len(data)} items. Sampling all available data.")
        sampled = data
    else:
        sampled = random.sample(data, sample_size)

    # Split data into chunks based on available GPUs
    random.shuffle(sampled)
    chunks = split_into_chunks(sampled, num_gpus)

    # Prepare output directory
    os.makedirs(args.output_dir, exist_ok=True)
    samplerate = 44100

    # Manager for inter-process communication
    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    processes = []
    for i in range(num_gpus):
        p = multiprocessing.Process(
            target=generate_audio_chunk,
            args=(
                args,
                chunks[i],
                i,  # GPU ID
                args.output_dir,
                samplerate,
                return_dict,
                i,  # Process ID

            )
        )
        processes.append(p)
        p.start()
        print(f"Started process {i} on GPU {i}")

    for p in processes:
        p.join()
        print(f"Process {p.pid} has finished.")

    # Aggregate results

    audio_info_list = [
        [{
            "path": f"{args.output_dir}/id_{sampled[j]['id']}_sample{i}.wav",
            "duration": sampled[j]["duration"],
            "captions": sampled[j]["captions"]
        }
        for i in range(1, args.num_samples+1) ] for j in range(sample_size)
    ]

    #print(audio_info_list)

    with open(f'{args.output_dir}/results.json','w') as f:
        json.dump(audio_info_list,f)

    print(f"All audio samples have been generated and saved to {args.output_dir}")


if __name__ == "__main__":
    multiprocessing.set_start_method('spawn')
    main()
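The prompt file this script reads is not shown in the upload; judging from the fields it accesses (`item['id']`, `item['captions']`, and later `sampled[j]["duration"]`), it appears to be a JSON list of per-prompt records. The sketch below writes such a file with made-up values; the file name and the command line at the end are only illustrative.

# Sketch of the prompt JSON that generate_crpo_dataset.py appears to expect (assumption).
import json

prompts = [
    {"id": 0, "captions": "Hammer slowly hitting the wooden table", "duration": 10},
    {"id": 1, "captions": "Gentle rain falling on a tin roof", "duration": 10},
]
with open("crpo_prompts.json", "w") as f:
    json.dump(prompts, f)

# Hypothetical invocation:
#   python generate_crpo_dataset.py --model tangoflux.safetensors \
#       --json_path crpo_prompts.json --output_dir crpo_out --sample_size 2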
external_models/TangoFlux/tangoflux/label_crpo.py
ADDED
@@ -0,0 +1,153 @@
import os
import json
import argparse
import torch
import laion_clap
import numpy as np
import multiprocessing
from tqdm import tqdm

def parse_args():
    parser = argparse.ArgumentParser(
        description="Labelling clap score for crpo dataset"
    )
    parser.add_argument(
        "--num_samples", type=int, default=5,
        help="Number of audio samples per prompt"
    )
    parser.add_argument(
        "--json_path", type=str, required=True,
        help="Path to input JSON file"
    )
    parser.add_argument(
        "--output_dir", type=str, required=True,
        help="Directory to save the final JSON with CLAP scores"
    )
    return parser.parse_args()

#python3 label_clap.py --json_path=/mnt/data/chiayu/crpo/crpo_iteration1/results.json --output_dir=/mnt/data/chiayu/crpo/crpo_iteration1
@torch.no_grad()
def compute_clap(model, audio_files, text_data):
    # Compute audio and text embeddings, then compute the dot product (CLAP score)
    audio_embed = model.get_audio_embedding_from_filelist(x=audio_files, use_tensor=True)
    text_embed = model.get_text_embedding(text_data, use_tensor=True)
    return audio_embed @ text_embed.T

def process_chunk(args, chunk, gpu_id, return_dict, process_id):
    """
    Process a chunk of the data on a specific GPU.
    Loads the CLAP model on the designated device, then for each item in the chunk,
    computes the CLAP scores and attaches them to the data.
    """
    try:
        device = f"cuda:{gpu_id}"
        torch.cuda.set_device(device)
        print(f"Process {process_id}: Using device {device}")

        # Initialize the CLAP model on this GPU
        model = laion_clap.CLAP_Module(enable_fusion=False)
        model.to(device)
        model.load_ckpt()
        model.eval()

        for j, item in enumerate(tqdm(chunk, desc=f"GPU {gpu_id}")):
            # Each item is assumed to be a list of samples.
            # Skip if already computed.
            if 'clap_score' in item[0]:
                continue

            # Collect audio file paths and text data (using the first caption)
            audio_files = [item[i]['path'] for i in range(args.num_samples)]
            text_data = [item[0]['captions']]

            try:
                clap_scores = compute_clap(model, audio_files, text_data)
            except Exception as e:
                print(f"Error processing item index {j} on GPU {gpu_id}: {e}")
                continue

            # Attach the computed score to each sample in the item
            for k in range(args.num_samples):
                item[k]['clap_score'] = np.round(clap_scores[k].item(), 3)

        return_dict[process_id] = chunk
        print(f"Process {process_id}: Completed processing on GPU {gpu_id}")
    except Exception as e:
        print(f"Process {process_id}: Error on GPU {gpu_id}: {e}")
        return_dict[process_id] = []

def split_into_chunks(data, num_chunks):
    """
    Splits data into num_chunks approximately equal parts.
    """
    avg = len(data) // num_chunks
    chunks = []
    for i in range(num_chunks):
        start = i * avg
        # Ensure the last chunk takes the remainder of the data
        end = (i + 1) * avg if i != num_chunks - 1 else len(data)
        chunks.append(data[start:end])
    return chunks

def main():
    args = parse_args()

    # Load data from JSON and slice by start/end if provided
    with open(args.json_path, 'r') as f:
        data = json.load(f)

    # Check GPU availability and split data accordingly
    num_gpus = torch.cuda.device_count()

    print(f"Found {num_gpus} GPUs. Splitting data into {num_gpus} chunks.")
    chunks = split_into_chunks(data, num_gpus)

    # Prepare output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Create a manager dict to collect results from all processes
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    processes = []

    for i in range(num_gpus):
        p = multiprocessing.Process(
            target=process_chunk,
            args=(args, chunks[i], i, return_dict, i)
        )
        processes.append(p)
        p.start()
        print(f"Started process {i} on GPU {i}")

    for p in processes:
        p.join()
        print(f"Process {p.pid} has finished.")

    # Aggregate all chunks back into a single list
    combined_data = []
    for i in range(num_gpus):
        combined_data.extend(return_dict[i])

    # Save the combined results to a single JSON file
    output_file = f"{args.output_dir}/clap_scores.json"
    with open(output_file, 'w') as f:
        json.dump(combined_data, f)
    print(f"All CLAP scores have been computed and saved to {output_file}")

    max_item = [max(x, key=lambda item: item['clap_score']) for x in combined_data]
    min_item = [min(x, key=lambda item: item['clap_score']) for x in combined_data]

    crpo_dataset = []
    for chosen,reject in zip(max_item,min_item):
        crpo_dataset.append({"captions": chosen['captions'],
                             "duration": chosen['duration'],
                             "chosen": chosen['path'],
                             "reject": reject['path']})

    with open(f"{args.output_dir}/train.json",'w') as f:
        json.dump(crpo_dataset,f)


if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    main()
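The pairing rule at the end of `main()` is simple: within each group of samples for a prompt, the highest-CLAP sample becomes "chosen" and the lowest becomes "reject". A toy illustration on made-up scores and paths (purely illustrative values) follows.

# Toy illustration of the chosen/reject selection used above.
group = [
    {"captions": "rain on a tin roof", "duration": 10, "path": "id_0_sample1.wav", "clap_score": 0.412},
    {"captions": "rain on a tin roof", "duration": 10, "path": "id_0_sample2.wav", "clap_score": 0.287},
    {"captions": "rain on a tin roof", "duration": 10, "path": "id_0_sample3.wav", "clap_score": 0.355},
]
chosen = max(group, key=lambda item: item["clap_score"])
reject = min(group, key=lambda item: item["clap_score"])
pair = {
    "captions": chosen["captions"],
    "duration": chosen["duration"],
    "chosen": chosen["path"],
    "reject": reject["path"],
}
print(pair)  # one entry of the train.json written above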
external_models/TangoFlux/tangoflux/model.py
ADDED
@@ -0,0 +1,556 @@
from transformers import T5EncoderModel, T5TokenizerFast
import torch
from diffusers import FluxTransformer2DModel
from torch import nn
import random
from typing import List
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.training_utils import compute_density_for_timestep_sampling
import copy
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

from typing import Optional, Union, List
from datasets import load_dataset, Audio
from math import pi
import inspect
import yaml


class StableAudioPositionalEmbedding(nn.Module):
    """Used for continuous time
    Adapted from Stable Audio Open.
    """

    def __init__(self, dim: int):
        super().__init__()
        assert (dim % 2) == 0
        half_dim = dim // 2
        self.weights = nn.Parameter(torch.randn(half_dim))

    def forward(self, times: torch.Tensor) -> torch.Tensor:
        times = times[..., None]
        freqs = times * self.weights[None] * 2 * pi
        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
        fouriered = torch.cat((times, fouriered), dim=-1)
        return fouriered


class DurationEmbedder(nn.Module):
    """
    A simple linear projection model to map numbers to a latent space.

    Code is adapted from
    https://github.com/Stability-AI/stable-audio-tools

    Args:
        number_embedding_dim (`int`):
            Dimensionality of the number embeddings.
        min_value (`int`):
            The minimum value of the seconds number conditioning modules.
        max_value (`int`):
            The maximum value of the seconds number conditioning modules.
        internal_dim (`int`):
            Dimensionality of the intermediate number hidden states.
    """

    def __init__(
        self,
        number_embedding_dim,
        min_value,
        max_value,
        internal_dim: Optional[int] = 256,
    ):
        super().__init__()
        self.time_positional_embedding = nn.Sequential(
            StableAudioPositionalEmbedding(internal_dim),
            nn.Linear(in_features=internal_dim + 1, out_features=number_embedding_dim),
        )

        self.number_embedding_dim = number_embedding_dim
        self.min_value = min_value
        self.max_value = max_value
        self.dtype = torch.float32

    def forward(
        self,
        floats: torch.Tensor,
    ):
        floats = floats.clamp(self.min_value, self.max_value)

        normalized_floats = (floats - self.min_value) / (
            self.max_value - self.min_value
        )

        # Cast floats to same type as embedder
        embedder_dtype = next(self.time_positional_embedding.parameters()).dtype
        normalized_floats = normalized_floats.to(embedder_dtype)

        embedding = self.time_positional_embedding(normalized_floats)
        float_embeds = embedding.view(-1, 1, self.number_embedding_dim)

        return float_embeds


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):

    if timesteps is not None and sigmas is not None:
        raise ValueError(
            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
        )
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(
            inspect.signature(scheduler.set_timesteps).parameters.keys()
        )
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(
            inspect.signature(scheduler.set_timesteps).parameters.keys()
        )
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class TangoFlux(nn.Module):

    def __init__(self, config, text_encoder_dir=None, initialize_reference_model=False,):

        super().__init__()

        self.num_layers = config.get("num_layers", 6)
        self.num_single_layers = config.get("num_single_layers", 18)
        self.in_channels = config.get("in_channels", 64)
        self.attention_head_dim = config.get("attention_head_dim", 128)
        self.joint_attention_dim = config.get("joint_attention_dim", 1024)
        self.num_attention_heads = config.get("num_attention_heads", 8)
        self.audio_seq_len = config.get("audio_seq_len", 645)
        self.max_duration = config.get("max_duration", 30)
        self.uncondition = config.get("uncondition", False)
        self.text_encoder_name = config.get("text_encoder_name", "google/flan-t5-large")

        self.noise_scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000)
        self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)
        self.max_text_seq_len = 64
        self.text_encoder = T5EncoderModel.from_pretrained(
            text_encoder_dir if text_encoder_dir is not None else self.text_encoder_name
        )
        self.tokenizer = T5TokenizerFast.from_pretrained(
            text_encoder_dir if text_encoder_dir is not None else self.text_encoder_name
        )
        self.text_embedding_dim = self.text_encoder.config.d_model

        self.fc = nn.Sequential(
            nn.Linear(self.text_embedding_dim, self.joint_attention_dim), nn.ReLU()
        )
        self.duration_emebdder = DurationEmbedder(
            self.text_embedding_dim, min_value=0, max_value=self.max_duration
        )

        self.transformer = FluxTransformer2DModel(
            in_channels=self.in_channels,
            num_layers=self.num_layers,
            num_single_layers=self.num_single_layers,
            attention_head_dim=self.attention_head_dim,
            num_attention_heads=self.num_attention_heads,
            joint_attention_dim=self.joint_attention_dim,
            pooled_projection_dim=self.text_embedding_dim,
            guidance_embeds=False,
        )

        self.beta_dpo = 2000  ## this is used for DPO training

    def get_sigmas(self, timesteps, n_dim=3, dtype=torch.float32):
        device = self.text_encoder.device
        sigmas = self.noise_scheduler_copy.sigmas.to(device=device, dtype=dtype)

        schedule_timesteps = self.noise_scheduler_copy.timesteps.to(device)
        timesteps = timesteps.to(device)
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def encode_text_classifier_free(self, prompt: List[str], num_samples_per_prompt=1):
        device = self.text_encoder.device
        batch = self.tokenizer(
            prompt,
            max_length=self.tokenizer.model_max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
            device
        )

        with torch.no_grad():
            prompt_embeds = self.text_encoder(
                input_ids=input_ids, attention_mask=attention_mask
            )[0]

        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)

        # get unconditional embeddings for classifier free guidance
        uncond_tokens = [""]

        max_length = prompt_embeds.shape[1]
        uncond_batch = self.tokenizer(
            uncond_tokens,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        uncond_input_ids = uncond_batch.input_ids.to(device)
        uncond_attention_mask = uncond_batch.attention_mask.to(device)

        with torch.no_grad():
            negative_prompt_embeds = self.text_encoder(
                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
            )[0]

        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(
            num_samples_per_prompt, 0
        )
        uncond_attention_mask = uncond_attention_mask.repeat_interleave(
            num_samples_per_prompt, 0
        )

        # For classifier free guidance, we need to do two forward passes.
        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes

        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
        boolean_prompt_mask = (prompt_mask == 1).to(device)

        return prompt_embeds, boolean_prompt_mask

    @torch.no_grad()
    def encode_text(self, prompt):
        device = self.text_encoder.device
        batch = self.tokenizer(
            prompt,
            max_length=self.max_text_seq_len,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(
            device
        )

        encoder_hidden_states = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )[0]

        boolean_encoder_mask = (attention_mask == 1).to(device)

        return encoder_hidden_states, boolean_encoder_mask

    def encode_duration(self, duration):
        return self.duration_emebdder(duration)

    @torch.no_grad()
    def inference_flow(
        self,
        prompt,
        num_inference_steps=50,
        timesteps=None,
        guidance_scale=3,
        duration=10,
        seed=0,
        disable_progress=False,
        num_samples_per_prompt=1,
        callback_on_step_end=None,
    ):
        """Only tested for single inference. Batch inference has not been tested."""

        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True

        bsz = num_samples_per_prompt
        device = self.transformer.device
        scheduler = self.noise_scheduler

        if not isinstance(prompt, list):
            prompt = [prompt]
        if not isinstance(duration, torch.Tensor):
            duration = torch.tensor([duration], device=device)
        classifier_free_guidance = guidance_scale > 1.0
        duration_hidden_states = self.encode_duration(duration)
        if classifier_free_guidance:
            bsz = 2 * num_samples_per_prompt

            encoder_hidden_states, boolean_encoder_mask = (
                self.encode_text_classifier_free(
                    prompt, num_samples_per_prompt=num_samples_per_prompt
                )
            )
            duration_hidden_states = duration_hidden_states.repeat(bsz, 1, 1)

        else:

            encoder_hidden_states, boolean_encoder_mask = self.encode_text(
                prompt, num_samples_per_prompt=num_samples_per_prompt
            )

        mask_expanded = boolean_encoder_mask.unsqueeze(-1).expand_as(
            encoder_hidden_states
        )
        masked_data = torch.where(
            mask_expanded, encoder_hidden_states, torch.tensor(float("nan"))
        )

        pooled = torch.nanmean(masked_data, dim=1)
        pooled_projection = self.fc(pooled)

        encoder_hidden_states = torch.cat(
            [encoder_hidden_states, duration_hidden_states], dim=1
        )  ## (bs,seq_len,dim)

        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        timesteps, num_inference_steps = retrieve_timesteps(
            scheduler, num_inference_steps, device, timesteps, sigmas
        )

        latents = torch.randn(num_samples_per_prompt, self.audio_seq_len, 64)
        weight_dtype = latents.dtype

        progress_bar = tqdm(range(num_inference_steps), disable=disable_progress)

        txt_ids = torch.zeros(bsz, encoder_hidden_states.shape[1], 3).to(device)
        audio_ids = (
            torch.arange(self.audio_seq_len)
            .unsqueeze(0)
            .unsqueeze(-1)
            .repeat(bsz, 1, 3)
            .to(device)
        )

        timesteps = timesteps.to(device)
        latents = latents.to(device)
        encoder_hidden_states = encoder_hidden_states.to(device)

        for i, t in enumerate(timesteps):

            latents_input = (
                torch.cat([latents] * 2) if classifier_free_guidance else latents
            )

            noise_pred = self.transformer(
                hidden_states=latents_input,
                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
                timestep=torch.tensor([t / 1000], device=device),
                guidance=None,
                pooled_projections=pooled_projection,
                encoder_hidden_states=encoder_hidden_states,
                txt_ids=txt_ids,
                img_ids=audio_ids,
                return_dict=False,
            )[0]

            if classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            latents = scheduler.step(noise_pred, t, latents).prev_sample

            progress_bar.update(1)

            if callback_on_step_end is not None:
                callback_on_step_end()

        return latents

    def forward(self, latents, prompt, duration=torch.tensor([10]), sft=True):

        device = latents.device
        audio_seq_length = self.audio_seq_len
        bsz = latents.shape[0]

        encoder_hidden_states, boolean_encoder_mask = self.encode_text(prompt)
        duration_hidden_states = self.encode_duration(duration)

        mask_expanded = boolean_encoder_mask.unsqueeze(-1).expand_as(
            encoder_hidden_states
        )
        masked_data = torch.where(
            mask_expanded, encoder_hidden_states, torch.tensor(float("nan"))
        )
        pooled = torch.nanmean(masked_data, dim=1)
        pooled_projection = self.fc(pooled)

        ## Add duration hidden states to encoder hidden states
        encoder_hidden_states = torch.cat(
            [encoder_hidden_states, duration_hidden_states], dim=1
        )  ## (bs,seq_len,dim)

        txt_ids = torch.zeros(bsz, encoder_hidden_states.shape[1], 3).to(device)
        audio_ids = (
            torch.arange(audio_seq_length)
            .unsqueeze(0)
            .unsqueeze(-1)
            .repeat(bsz, 1, 3)
            .to(device)
        )

        if sft:

            if self.uncondition:
                mask_indices = [k for k in range(len(prompt)) if random.random() < 0.1]
                if len(mask_indices) > 0:
                    encoder_hidden_states[mask_indices] = 0

            noise = torch.randn_like(latents)

            u = compute_density_for_timestep_sampling(
                weighting_scheme="logit_normal",
                batch_size=bsz,
                logit_mean=0,
                logit_std=1,
                mode_scale=None,
            )

            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
            timesteps = self.noise_scheduler_copy.timesteps[indices].to(
                device=latents.device
            )
            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)

            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise

            model_pred = self.transformer(
                hidden_states=noisy_model_input,
                encoder_hidden_states=encoder_hidden_states,
                pooled_projections=pooled_projection,
                img_ids=audio_ids,
                txt_ids=txt_ids,
                guidance=None,
                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
                timestep=timesteps / 1000,
                return_dict=False,
            )[0]

            target = noise - latents
            loss = torch.mean(
                ((model_pred.float() - target.float()) ** 2).reshape(
                    target.shape[0], -1
                ),
                1,
            )
            loss = loss.mean()
            raw_model_loss, raw_ref_loss, implicit_acc = (
                0,
                0,
                0,
            )  ## default these to 0 when doing sft

        else:
            encoder_hidden_states = encoder_hidden_states.repeat(2, 1, 1)
            pooled_projection = pooled_projection.repeat(2, 1)
            noise = (
                torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1)
            )  ## Have to sample same noise for preferred and rejected
            u = compute_density_for_timestep_sampling(
                weighting_scheme="logit_normal",
                batch_size=bsz // 2,
                logit_mean=0,
                logit_std=1,
                mode_scale=None,
            )

            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
            timesteps = self.noise_scheduler_copy.timesteps[indices].to(
                device=latents.device
            )
            timesteps = timesteps.repeat(2)
            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)

            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise

            model_pred = self.transformer(
                hidden_states=noisy_model_input,
                encoder_hidden_states=encoder_hidden_states,
                pooled_projections=pooled_projection,
                img_ids=audio_ids,
                txt_ids=txt_ids,
                guidance=None,
                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
                timestep=timesteps / 1000,
                return_dict=False,
            )[0]
            target = noise - latents

            model_losses = F.mse_loss(
                model_pred.float(), target.float(), reduction="none"
            )
            model_losses = model_losses.mean(
                dim=list(range(1, len(model_losses.shape)))
            )
            model_losses_w, model_losses_l = model_losses.chunk(2)
            model_diff = model_losses_w - model_losses_l
            raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())

            with torch.no_grad():
                ref_preds = self.ref_transformer(
                    hidden_states=noisy_model_input,
                    encoder_hidden_states=encoder_hidden_states,
                    pooled_projections=pooled_projection,
                    img_ids=audio_ids,
                    txt_ids=txt_ids,
                    guidance=None,
                    timestep=timesteps / 1000,
                    return_dict=False,
                )[0]

                ref_loss = F.mse_loss(
                    ref_preds.float(), target.float(), reduction="none"
                )
                ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape))))

                ref_losses_w, ref_losses_l = ref_loss.chunk(2)
                ref_diff = ref_losses_w - ref_losses_l
                raw_ref_loss = ref_loss.mean()

            scale_term = -0.5 * self.beta_dpo
            inside_term = scale_term * (model_diff - ref_diff)
            implicit_acc = (
                scale_term * (model_diff - ref_diff) > 0
            ).sum().float() / inside_term.size(0)
            loss = -1 * F.logsigmoid(inside_term).mean() + model_losses_w.mean()

        ## raw_model_loss, raw_ref_loss, implicit_acc are used to help analyze DPO behaviour.
        return loss, raw_model_loss, raw_ref_loss, implicit_acc
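Two identities from the model above are worth isolating: training noises the latents by linear interpolation controlled by sigma and regresses the velocity (noise minus latents), while inference blends the unconditional and text-conditioned predictions with classifier-free guidance. The toy sketch below restates both on random tensors; the transformer outputs are stand-ins, not real predictions.

# Compact sketch of the flow-matching and CFG arithmetic used in forward() and inference_flow().
import torch

latents = torch.randn(2, 645, 64)        # clean audio latents (bsz, audio_seq_len, 64)
noise = torch.randn_like(latents)
sigma = torch.tensor(0.3).view(1, 1, 1)   # one sampled noise level, broadcast over the batch

noisy_model_input = (1.0 - sigma) * latents + sigma * noise   # same interpolation as in forward()
target = noise - latents                                      # velocity the transformer regresses

# Classifier-free guidance as applied per step in inference_flow():
guidance_scale = 4.5
noise_pred_uncond = torch.randn_like(latents)   # stand-in for the unconditional prediction
noise_pred_text = torch.randn_like(latents)     # stand-in for the text-conditioned prediction
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)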
external_models/TangoFlux/tangoflux/train.py
ADDED
@@ -0,0 +1,588 @@
import time
import argparse
import json
import logging
import math
import os
import yaml
from pathlib import Path
import diffusers
import datasets
import numpy as np
import pandas as pd
import wandb
import transformers
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import SchedulerType, get_scheduler
from model import TangoFlux
from datasets import load_dataset, Audio
from utils import Text2AudioDataset, read_wav_file, pad_wav

from diffusers import AutoencoderOobleck
import torchaudio

logger = get_logger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Rectified flow for text to audio generation task."
    )

    parser.add_argument(
        "--num_examples",
        type=int,
        default=-1,
        help="How many examples to use for training and validation.",
    )

    parser.add_argument(
        "--text_column",
        type=str,
        default="captions",
        help="The name of the column in the datasets containing the input texts.",
    )
    parser.add_argument(
        "--audio_column",
        type=str,
        default="location",
        help="The name of the column in the datasets containing the audio paths.",
    )
    parser.add_argument(
        "--adam_beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_beta2",
        type=float,
        default=0.95,
        help="The beta2 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="tangoflux_config.yaml",
        help="Config file defining the model size as well as other hyperparameters.",
    )
    parser.add_argument(
        "--prefix",
        type=str,
        default="",
        help="Add prefix in text prompts.",
    )

    parser.add_argument(
        "--learning_rate",
        type=float,
        default=3e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--weight_decay", type=float, default=1e-8, help="Weight decay to use."
    )

    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )

    parser.add_argument(
        "--lr_scheduler_type",
        type=SchedulerType,
        default="linear",
        help="The scheduler type to use.",
        choices=[
            "linear",
            "cosine",
            "cosine_with_restarts",
            "polynomial",
            "constant",
            "constant_with_warmup",
        ],
    )
    parser.add_argument(
        "--num_warmup_steps",
        type=int,
        default=0,
        help="Number of steps for the warmup in the lr scheduler.",
    )
    parser.add_argument(
        "--adam_epsilon",
        type=float,
        default=1e-08,
        help="Epsilon value for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_weight_decay",
        type=float,
        default=1e-2,
        help="Weight decay for the Adam optimizer.",
    )
    parser.add_argument(
        "--seed", type=int, default=None, help="A seed for reproducible training."
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default="best",
        help="Whether the various states should be saved at the end of every 'epoch' or 'best' whenever validation loss decreases.",
    )
    parser.add_argument(
        "--save_every",
        type=int,
        default=5,
        help="Save the model after this many epochs when checkpointing_steps is set to 'best'.",
    )

    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a local checkpoint folder.",
    )

    parser.add_argument(
        "--load_from_checkpoint",
        type=str,
        default=None,
        help="Whether to continue training from a model weight.",
    )

    args = parser.parse_args()

    return args


def main():
    args = parse_args()
    accelerator_log_kwargs = {}

    def load_config(config_path):
        with open(config_path, "r") as file:
            return yaml.safe_load(file)

    config = load_config(args.config)

    learning_rate = float(config["training"]["learning_rate"])
    num_train_epochs = int(config["training"]["num_train_epochs"])
    num_warmup_steps = int(config["training"]["num_warmup_steps"])
    per_device_batch_size = int(config["training"]["per_device_batch_size"])
    gradient_accumulation_steps = int(config["training"]["gradient_accumulation_steps"])

    output_dir = config["paths"]["output_dir"]

    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
        **accelerator_log_kwargs,
    )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)

    datasets.utils.logging.set_verbosity_error()
    diffusers.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle output directory creation and wandb tracking
    if accelerator.is_main_process:
        if output_dir is None or output_dir == "":
            output_dir = "saved/" + str(int(time.time()))

            if not os.path.exists("saved"):
                os.makedirs("saved")

            os.makedirs(output_dir, exist_ok=True)

        elif output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)

        os.makedirs("{}/{}".format(output_dir, "outputs"), exist_ok=True)
        with open("{}/summary.jsonl".format(output_dir), "a") as f:
            f.write(json.dumps(dict(vars(args))) + "\n\n")

        accelerator.project_configuration.automatic_checkpoint_naming = False

        wandb.init(
            project="Text to Audio Flow matching",
            settings=wandb.Settings(_disable_stats=True),
        )

    accelerator.wait_for_everyone()

    # Get the datasets
    data_files = {}
    # if args.train_file is not None:
    if config["paths"]["train_file"] != "":
        data_files["train"] = config["paths"]["train_file"]
    # if args.validation_file is not None:
    if config["paths"]["val_file"] != "":
        data_files["validation"] = config["paths"]["val_file"]
    if config["paths"]["test_file"] != "":
        data_files["test"] = config["paths"]["test_file"]
    else:
        data_files["test"] = config["paths"]["val_file"]

    extension = "json"
    raw_datasets = load_dataset(extension, data_files=data_files)
    text_column, audio_column = args.text_column, args.audio_column

    model = TangoFlux(config=config["model"])
    vae = AutoencoderOobleck.from_pretrained(
        "stabilityai/stable-audio-open-1.0", subfolder="vae"
    )

    ## Freeze vae
    for param in vae.parameters():
        param.requires_grad = False
    vae.eval()

    ## Freeze text encoder param
    for param in model.text_encoder.parameters():
        param.requires_grad = False
    model.text_encoder.eval()

    prefix = args.prefix

    with accelerator.main_process_first():
        train_dataset = Text2AudioDataset(
            raw_datasets["train"],
            prefix,
            text_column,
            audio_column,
            "duration",
            args.num_examples,
        )
        eval_dataset = Text2AudioDataset(
            raw_datasets["validation"],
            prefix,
            text_column,
            audio_column,
            "duration",
            args.num_examples,
        )
        test_dataset = Text2AudioDataset(
            raw_datasets["test"],
            prefix,
            text_column,
            audio_column,
            "duration",
            args.num_examples,
        )

        accelerator.print(
            "Num instances in train: {}, validation: {}, test: {}".format(
                train_dataset.get_num_instances(),
                eval_dataset.get_num_instances(),
                test_dataset.get_num_instances(),
            )
        )

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
|
302 |
+
batch_size=config["training"]["per_device_batch_size"],
|
303 |
+
collate_fn=train_dataset.collate_fn,
|
304 |
+
)
|
305 |
+
eval_dataloader = DataLoader(
|
306 |
+
eval_dataset,
|
307 |
+
shuffle=True,
|
308 |
+
batch_size=config["training"]["per_device_batch_size"],
|
309 |
+
collate_fn=eval_dataset.collate_fn,
|
310 |
+
)
|
311 |
+
test_dataloader = DataLoader(
|
312 |
+
test_dataset,
|
313 |
+
shuffle=False,
|
314 |
+
batch_size=config["training"]["per_device_batch_size"],
|
315 |
+
collate_fn=test_dataset.collate_fn,
|
316 |
+
)
|
317 |
+
|
318 |
+
# Optimizer
|
319 |
+
|
320 |
+
optimizer_parameters = list(model.transformer.parameters()) + list(
|
321 |
+
model.fc.parameters()
|
322 |
+
)
|
323 |
+
num_trainable_parameters = sum(
|
324 |
+
p.numel() for p in model.parameters() if p.requires_grad
|
325 |
+
)
|
326 |
+
accelerator.print("Num trainable parameters: {}".format(num_trainable_parameters))
|
327 |
+
|
328 |
+
if args.load_from_checkpoint:
|
329 |
+
from safetensors.torch import load_file
|
330 |
+
|
331 |
+
w1 = load_file(args.load_from_checkpoint)
|
332 |
+
model.load_state_dict(w1, strict=False)
|
333 |
+
logger.info("Weights loaded from{}".format(args.load_from_checkpoint))
|
334 |
+
|
335 |
+
optimizer = torch.optim.AdamW(
|
336 |
+
optimizer_parameters,
|
337 |
+
lr=learning_rate,
|
338 |
+
betas=(args.adam_beta1, args.adam_beta2),
|
339 |
+
weight_decay=args.adam_weight_decay,
|
340 |
+
eps=args.adam_epsilon,
|
341 |
+
)
|
342 |
+
|
343 |
+
# Scheduler and math around the number of training steps.
|
344 |
+
overrode_max_train_steps = False
|
345 |
+
num_update_steps_per_epoch = math.ceil(
|
346 |
+
len(train_dataloader) / gradient_accumulation_steps
|
347 |
+
)
|
348 |
+
if args.max_train_steps is None:
|
349 |
+
args.max_train_steps = num_train_epochs * num_update_steps_per_epoch
|
350 |
+
overrode_max_train_steps = True
|
351 |
+
|
352 |
+
lr_scheduler = get_scheduler(
|
353 |
+
name=args.lr_scheduler_type,
|
354 |
+
optimizer=optimizer,
|
355 |
+
num_warmup_steps=num_warmup_steps
|
356 |
+
* gradient_accumulation_steps
|
357 |
+
* accelerator.num_processes,
|
358 |
+
num_training_steps=args.max_train_steps * gradient_accumulation_steps,
|
359 |
+
)
|
360 |
+
|
361 |
+
# Prepare everything with our `accelerator`.
|
362 |
+
vae, model, optimizer, lr_scheduler = accelerator.prepare(
|
363 |
+
vae, model, optimizer, lr_scheduler
|
364 |
+
)
|
365 |
+
|
366 |
+
train_dataloader, eval_dataloader, test_dataloader = accelerator.prepare(
|
367 |
+
train_dataloader, eval_dataloader, test_dataloader
|
368 |
+
)
|
369 |
+
|
370 |
+
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
371 |
+
num_update_steps_per_epoch = math.ceil(
|
372 |
+
len(train_dataloader) / gradient_accumulation_steps
|
373 |
+
)
|
374 |
+
if overrode_max_train_steps:
|
375 |
+
args.max_train_steps = num_train_epochs * num_update_steps_per_epoch
|
376 |
+
# Afterwards we recalculate our number of training epochs
|
377 |
+
num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
378 |
+
|
379 |
+
# Figure out how many steps we should save the Accelerator states
|
380 |
+
checkpointing_steps = args.checkpointing_steps
|
381 |
+
if checkpointing_steps is not None and checkpointing_steps.isdigit():
|
382 |
+
checkpointing_steps = int(checkpointing_steps)
|
383 |
+
|
384 |
+
# We need to initialize the trackers we use, and also store our configuration.
|
385 |
+
# The trackers initializes automatically on the main process.
|
386 |
+
|
387 |
+
# Train!
|
388 |
+
total_batch_size = (
|
389 |
+
per_device_batch_size * accelerator.num_processes * gradient_accumulation_steps
|
390 |
+
)
|
391 |
+
|
392 |
+
logger.info("***** Running training *****")
|
393 |
+
logger.info(f" Num examples = {len(train_dataset)}")
|
394 |
+
logger.info(f" Num Epochs = {num_train_epochs}")
|
395 |
+
logger.info(f" Instantaneous batch size per device = {per_device_batch_size}")
|
396 |
+
logger.info(
|
397 |
+
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
|
398 |
+
)
|
399 |
+
logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
|
400 |
+
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
401 |
+
|
402 |
+
# Only show the progress bar once on each machine.
|
403 |
+
progress_bar = tqdm(
|
404 |
+
range(args.max_train_steps), disable=not accelerator.is_local_main_process
|
405 |
+
)
|
406 |
+
|
407 |
+
completed_steps = 0
|
408 |
+
starting_epoch = 0
|
409 |
+
# Potentially load in the weights and states from a previous save
|
410 |
+
resume_from_checkpoint = config["paths"]["resume_from_checkpoint"]
|
411 |
+
if resume_from_checkpoint != "":
|
412 |
+
accelerator.load_state(resume_from_checkpoint)
|
413 |
+
accelerator.print(f"Resumed from local checkpoint: {resume_from_checkpoint}")
|
414 |
+
|
415 |
+
# Duration of the audio clips in seconds
|
416 |
+
best_loss = np.inf
|
417 |
+
length = config["training"]["max_audio_duration"]
|
418 |
+
|
419 |
+
for epoch in range(starting_epoch, num_train_epochs):
|
420 |
+
model.train()
|
421 |
+
total_loss, total_val_loss = 0, 0
|
422 |
+
for step, batch in enumerate(train_dataloader):
|
423 |
+
|
424 |
+
with accelerator.accumulate(model):
|
425 |
+
optimizer.zero_grad()
|
426 |
+
device = model.device
|
427 |
+
text, audios, duration, _ = batch
|
428 |
+
|
429 |
+
with torch.no_grad():
|
430 |
+
audio_list = []
|
431 |
+
|
432 |
+
for audio_path in audios:
|
433 |
+
|
434 |
+
wav = read_wav_file(
|
435 |
+
audio_path, length
|
436 |
+
) ## Only read the first 30 seconds of audio
|
437 |
+
if (
|
438 |
+
wav.shape[0] == 1
|
439 |
+
): ## If this audio is mono, we repeat the channel so it become "fake stereo"
|
440 |
+
wav = wav.repeat(2, 1)
|
441 |
+
audio_list.append(wav)
|
442 |
+
|
443 |
+
audio_input = torch.stack(audio_list, dim=0)
|
444 |
+
audio_input = audio_input.to(device)
|
445 |
+
unwrapped_vae = accelerator.unwrap_model(vae)
|
446 |
+
|
447 |
+
duration = torch.tensor(duration, device=device)
|
448 |
+
duration = torch.clamp(
|
449 |
+
duration, max=length
|
450 |
+
) ## clamp duration to max audio length
|
451 |
+
|
452 |
+
audio_latent = unwrapped_vae.encode(
|
453 |
+
audio_input
|
454 |
+
).latent_dist.sample()
|
455 |
+
audio_latent = audio_latent.transpose(
|
456 |
+
1, 2
|
457 |
+
) ## Tranpose to (bsz, seq_len, channel)
|
458 |
+
|
459 |
+
loss, _, _, _ = model(audio_latent, text, duration=duration)
|
460 |
+
total_loss += loss.detach().float()
|
461 |
+
accelerator.backward(loss)
|
462 |
+
|
463 |
+
if accelerator.sync_gradients:
|
464 |
+
progress_bar.update(1)
|
465 |
+
completed_steps += 1
|
466 |
+
|
467 |
+
optimizer.step()
|
468 |
+
lr_scheduler.step()
|
469 |
+
|
470 |
+
if completed_steps % 10 == 0 and accelerator.is_main_process:
|
471 |
+
|
472 |
+
total_norm = 0.0
|
473 |
+
for p in model.parameters():
|
474 |
+
if p.grad is not None:
|
475 |
+
param_norm = p.grad.data.norm(2)
|
476 |
+
total_norm += param_norm.item() ** 2
|
477 |
+
|
478 |
+
total_norm = total_norm**0.5
|
479 |
+
logger.info(
|
480 |
+
f"Step {completed_steps}, Loss: {loss.item()}, Grad Norm: {total_norm}"
|
481 |
+
)
|
482 |
+
|
483 |
+
lr = lr_scheduler.get_last_lr()[0]
|
484 |
+
result = {
|
485 |
+
"train_loss": loss.item(),
|
486 |
+
"grad_norm": total_norm,
|
487 |
+
"learning_rate": lr,
|
488 |
+
}
|
489 |
+
|
490 |
+
# result["val_loss"] = round(total_val_loss.item()/len(eval_dataloader), 4)
|
491 |
+
wandb.log(result, step=completed_steps)
|
492 |
+
|
493 |
+
# Checks if the accelerator has performed an optimization step behind the scenes
|
494 |
+
|
495 |
+
if isinstance(checkpointing_steps, int):
|
496 |
+
if completed_steps % checkpointing_steps == 0:
|
497 |
+
output_dir = f"step_{completed_steps }"
|
498 |
+
if output_dir is not None:
|
499 |
+
output_dir = os.path.join(output_dir, output_dir)
|
500 |
+
accelerator.save_state(output_dir)
|
501 |
+
|
502 |
+
if completed_steps >= args.max_train_steps:
|
503 |
+
break
|
504 |
+
|
505 |
+
model.eval()
|
506 |
+
eval_progress_bar = tqdm(
|
507 |
+
range(len(eval_dataloader)), disable=not accelerator.is_local_main_process
|
508 |
+
)
|
509 |
+
for step, batch in enumerate(eval_dataloader):
|
510 |
+
with accelerator.accumulate(model) and torch.no_grad():
|
511 |
+
device = model.device
|
512 |
+
text, audios, duration, _ = batch
|
513 |
+
|
514 |
+
audio_list = []
|
515 |
+
for audio_path in audios:
|
516 |
+
|
517 |
+
wav = read_wav_file(
|
518 |
+
audio_path, length
|
519 |
+
) ## make sure none of audio exceed 30 sec
|
520 |
+
if (
|
521 |
+
wav.shape[0] == 1
|
522 |
+
): ## If this audio is mono, we repeat the channel so it become "fake stereo"
|
523 |
+
wav = wav.repeat(2, 1)
|
524 |
+
audio_list.append(wav)
|
525 |
+
|
526 |
+
audio_input = torch.stack(audio_list, dim=0)
|
527 |
+
audio_input = audio_input.to(device)
|
528 |
+
duration = torch.tensor(duration, device=device)
|
529 |
+
unwrapped_vae = accelerator.unwrap_model(vae)
|
530 |
+
audio_latent = unwrapped_vae.encode(audio_input).latent_dist.sample()
|
531 |
+
audio_latent = audio_latent.transpose(
|
532 |
+
1, 2
|
533 |
+
) ## Tranpose to (bsz, seq_len, channel)
|
534 |
+
|
535 |
+
val_loss, _, _, _ = model(audio_latent, text, duration=duration)
|
536 |
+
|
537 |
+
total_val_loss += val_loss.detach().float()
|
538 |
+
eval_progress_bar.update(1)
|
539 |
+
|
540 |
+
if accelerator.is_main_process:
|
541 |
+
|
542 |
+
result = {}
|
543 |
+
result["epoch"] = float(epoch + 1)
|
544 |
+
|
545 |
+
result["epoch/train_loss"] = round(
|
546 |
+
total_loss.item() / len(train_dataloader), 4
|
547 |
+
)
|
548 |
+
result["epoch/val_loss"] = round(
|
549 |
+
total_val_loss.item() / len(eval_dataloader), 4
|
550 |
+
)
|
551 |
+
|
552 |
+
wandb.log(result, step=completed_steps)
|
553 |
+
|
554 |
+
result_string = "Epoch: {}, Loss Train: {}, Val: {}\n".format(
|
555 |
+
epoch, result["epoch/train_loss"], result["epoch/val_loss"]
|
556 |
+
)
|
557 |
+
|
558 |
+
accelerator.print(result_string)
|
559 |
+
|
560 |
+
with open("{}/summary.jsonl".format(output_dir), "a") as f:
|
561 |
+
f.write(json.dumps(result) + "\n\n")
|
562 |
+
|
563 |
+
logger.info(result)
|
564 |
+
|
565 |
+
if result["epoch/val_loss"] < best_loss:
|
566 |
+
best_loss = result["epoch/val_loss"]
|
567 |
+
save_checkpoint = True
|
568 |
+
else:
|
569 |
+
save_checkpoint = False
|
570 |
+
|
571 |
+
accelerator.wait_for_everyone()
|
572 |
+
if accelerator.is_main_process and args.checkpointing_steps == "best":
|
573 |
+
if save_checkpoint:
|
574 |
+
accelerator.save_state("{}/{}".format(output_dir, "best"))
|
575 |
+
|
576 |
+
if (epoch + 1) % args.save_every == 0:
|
577 |
+
accelerator.save_state(
|
578 |
+
"{}/{}".format(output_dir, "epoch_" + str(epoch + 1))
|
579 |
+
)
|
580 |
+
|
581 |
+
if accelerator.is_main_process and args.checkpointing_steps == "epoch":
|
582 |
+
accelerator.save_state(
|
583 |
+
"{}/{}".format(output_dir, "epoch_" + str(epoch + 1))
|
584 |
+
)
|
585 |
+
|
586 |
+
|
587 |
+
if __name__ == "__main__":
|
588 |
+
main()
|
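
The scheduler and logging math above is easy to sanity-check offline. A minimal sketch, assuming purely illustrative numbers (the real values come from configs/tangoflux_config.yaml and the accelerate launch configuration, not from here), of how num_update_steps_per_epoch, max_train_steps and the effective batch size relate:

    import math

    # Hypothetical values for illustration only.
    num_train_examples = 40_000
    per_device_batch_size = 4
    gradient_accumulation_steps = 2
    num_processes = 2                 # GPUs visible to accelerate
    num_train_epochs = 80

    batches_per_epoch = math.ceil(num_train_examples / (per_device_batch_size * num_processes))
    num_update_steps_per_epoch = math.ceil(batches_per_epoch / gradient_accumulation_steps)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
    total_batch_size = per_device_batch_size * num_processes * gradient_accumulation_steps

    print(num_update_steps_per_epoch, max_train_steps, total_batch_size)
    # 2500 200000 16
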
external_models/TangoFlux/tangoflux/train_dpo.py
ADDED
@@ -0,0 +1,608 @@
import time
import argparse
import json
import logging
import math
import os
import yaml

import copy
from pathlib import Path
import diffusers
import datasets
import numpy as np
import pandas as pd
import wandb
import transformers
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset, Audio
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import SchedulerType, get_scheduler
from tangoflux.model import TangoFlux
from tangoflux.utils import Text2AudioDataset, read_wav_file, DPOText2AudioDataset

from diffusers import AutoencoderOobleck
import torchaudio

logger = get_logger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Rectified flow for text to audio generation task."
    )

    parser.add_argument(
        "--num_examples", type=int, default=-1,
        help="How many examples to use for training and validation.",
    )
    parser.add_argument(
        "--text_column", type=str, default="captions",
        help="The name of the column in the datasets containing the input texts.",
    )
    parser.add_argument(
        "--audio_column", type=str, default="location",
        help="The name of the column in the datasets containing the audio paths.",
    )
    parser.add_argument(
        "--adam_beta1", type=float, default=0.9,
        help="The beta1 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_beta2", type=float, default=0.95,
        help="The beta2 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--config", type=str, default="tangoflux_config.yaml",
        help="Config file defining the model size.",
    )
    parser.add_argument(
        "--weight_decay", type=float, default=1e-8, help="Weight decay to use."
    )
    parser.add_argument(
        "--max_train_steps", type=int, default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--lr_scheduler_type", type=SchedulerType, default="linear",
        help="The scheduler type to use.",
        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
    )
    parser.add_argument(
        "--adam_epsilon", type=float, default=1e-08,
        help="Epsilon value for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_weight_decay", type=float, default=1e-2,
        help="Weight decay value for the Adam optimizer.",
    )
    parser.add_argument(
        "--seed", type=int, default=None, help="A seed for reproducible training."
    )
    parser.add_argument(
        "--checkpointing_steps", type=str, default="best",
        help="Whether the various states should be saved at the end of every 'epoch', or only ('best') whenever the validation loss decreases.",
    )
    parser.add_argument(
        "--save_every", type=int, default=5,
        help="Save the model every this many epochs when checkpointing_steps is set to 'best'.",
    )
    parser.add_argument(
        "--load_from_checkpoint", type=str, default=None,
        help="Whether to continue training from a model weight.",
    )

    args = parser.parse_args()

    # Sanity checks
    # if args.train_file is None and args.validation_file is None:
    #     raise ValueError("Need a training/validation file.")
    # else:
    #     if args.train_file is not None:
    #         extension = args.train_file.split(".")[-1]
    #         assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
    #     if args.validation_file is not None:
    #         extension = args.validation_file.split(".")[-1]
    #         assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."

    return args


def main():
    args = parse_args()
    accelerator_log_kwargs = {}

    def load_config(config_path):
        with open(config_path, "r") as file:
            return yaml.safe_load(file)

    config = load_config(args.config)

    learning_rate = float(config["training"]["learning_rate"])
    num_train_epochs = int(config["training"]["num_train_epochs"])
    num_warmup_steps = int(config["training"]["num_warmup_steps"])
    per_device_batch_size = int(config["training"]["per_device_batch_size"])
    gradient_accumulation_steps = int(config["training"]["gradient_accumulation_steps"])

    output_dir = config["paths"]["output_dir"]

    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
        **accelerator_log_kwargs,
    )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)

    datasets.utils.logging.set_verbosity_error()
    diffusers.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle output directory creation and wandb tracking
    if accelerator.is_main_process:
        if output_dir is None or output_dir == "":
            output_dir = "saved/" + str(int(time.time()))

            if not os.path.exists("saved"):
                os.makedirs("saved")

            os.makedirs(output_dir, exist_ok=True)

        elif output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)

        os.makedirs("{}/{}".format(output_dir, "outputs"), exist_ok=True)
        with open("{}/summary.jsonl".format(output_dir), "a") as f:
            f.write(json.dumps(dict(vars(args))) + "\n\n")

        accelerator.project_configuration.automatic_checkpoint_naming = False

        wandb.init(
            project="Text to Audio Flow matching DPO",
            settings=wandb.Settings(_disable_stats=True),
        )

    accelerator.wait_for_everyone()

    # Get the datasets
    data_files = {}
    if config["paths"]["train_file"] != "":
        data_files["train"] = config["paths"]["train_file"]
    if config["paths"]["val_file"] != "":
        data_files["validation"] = config["paths"]["val_file"]
    if config["paths"]["test_file"] != "":
        data_files["test"] = config["paths"]["test_file"]
    else:
        data_files["test"] = config["paths"]["val_file"]

    extension = "json"
    train_dataset = load_dataset(extension, data_files=data_files["train"])
    data_files.pop("train")
    raw_datasets = load_dataset(extension, data_files=data_files)
    text_column, audio_column = args.text_column, args.audio_column

    model = TangoFlux(config=config["model"], initialize_reference_model=True)
    vae = AutoencoderOobleck.from_pretrained(
        "stabilityai/stable-audio-open-1.0", subfolder="vae"
    )

    ## Freeze vae
    for param in vae.parameters():
        param.requires_grad = False
    vae.eval()

    ## Freeze text encoder params
    for param in model.text_encoder.parameters():
        param.requires_grad = False
    model.text_encoder.eval()

    prefix = ""

    with accelerator.main_process_first():
        train_dataset = DPOText2AudioDataset(
            train_dataset["train"], prefix, text_column, "chosen", "reject", "duration", args.num_examples
        )
        eval_dataset = Text2AudioDataset(
            raw_datasets["validation"], prefix, text_column, audio_column, "duration", args.num_examples
        )
        test_dataset = Text2AudioDataset(
            raw_datasets["test"], prefix, text_column, audio_column, "duration", args.num_examples
        )

        accelerator.print(
            "Num instances in train: {}, validation: {}, test: {}".format(
                train_dataset.get_num_instances(),
                eval_dataset.get_num_instances(),
                test_dataset.get_num_instances(),
            )
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True,
        batch_size=config["training"]["per_device_batch_size"],
        collate_fn=train_dataset.collate_fn,
    )
    eval_dataloader = DataLoader(
        eval_dataset, shuffle=True,
        batch_size=config["training"]["per_device_batch_size"],
        collate_fn=eval_dataset.collate_fn,
    )
    test_dataloader = DataLoader(
        test_dataset, shuffle=False,
        batch_size=config["training"]["per_device_batch_size"],
        collate_fn=test_dataset.collate_fn,
    )

    # Optimizer

    optimizer_parameters = list(model.transformer.parameters()) + list(model.fc.parameters())
    num_trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    accelerator.print("Num trainable parameters: {}".format(num_trainable_parameters))

    if args.load_from_checkpoint:
        from safetensors.torch import load_file

        w1 = load_file(args.load_from_checkpoint)
        model.load_state_dict(w1, strict=False)
        logger.info("Weights loaded from {}".format(args.load_from_checkpoint))

    ## Keep a frozen copy of the transformer as the reference model for the DPO objective
    model.ref_transformer = copy.deepcopy(model.transformer)
    model.ref_transformer.requires_grad_(False)
    model.ref_transformer.eval()
    for param in model.ref_transformer.parameters():
        param.requires_grad = False

    optimizer = torch.optim.AdamW(
        optimizer_parameters,
        lr=learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps * gradient_accumulation_steps * accelerator.num_processes,
        num_training_steps=args.max_train_steps * gradient_accumulation_steps,
    )

    # Prepare everything with our `accelerator`.
    vae, model, optimizer, lr_scheduler = accelerator.prepare(vae, model, optimizer, lr_scheduler)

    train_dataloader, eval_dataloader, test_dataloader = accelerator.prepare(
        train_dataloader, eval_dataloader, test_dataloader
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Figure out how many steps we should save the Accelerator states
    checkpointing_steps = args.checkpointing_steps
    if checkpointing_steps is not None and checkpointing_steps.isdigit():
        checkpointing_steps = int(checkpointing_steps)

    # Train!
    total_batch_size = (
        per_device_batch_size * accelerator.num_processes * gradient_accumulation_steps
    )

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {per_device_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)

    completed_steps = 0
    starting_epoch = 0
    # Potentially load in the weights and states from a previous save
    resume_from_checkpoint = config["paths"]["resume_from_checkpoint"]
    if resume_from_checkpoint != "":
        accelerator.load_state(resume_from_checkpoint)
        accelerator.print(f"Resumed from local checkpoint: {resume_from_checkpoint}")

    # Maximum duration of the audio clips in seconds
    best_loss = np.inf
    length = config["training"]["max_audio_duration"]

    for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        total_loss, total_val_loss = 0, 0

        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                optimizer.zero_grad()
                device = accelerator.device
                text, audio_w, audio_l, duration, _ = batch

                with torch.no_grad():
                    audio_list_w = []
                    audio_list_l = []
                    for audio_path in audio_w:
                        ## Only read the first `length` seconds of the preferred ("chosen") audio
                        wav = read_wav_file(audio_path, length)
                        if wav.shape[0] == 1:
                            ## If this audio is mono, repeat the channel so it becomes "fake stereo"
                            wav = wav.repeat(2, 1)
                        audio_list_w.append(wav)

                    for audio_path in audio_l:
                        ## Only read the first `length` seconds of the rejected audio
                        wav = read_wav_file(audio_path, length)
                        if wav.shape[0] == 1:
                            wav = wav.repeat(2, 1)
                        audio_list_l.append(wav)

                    audio_input_w = torch.stack(audio_list_w, dim=0).to(device)
                    audio_input_l = torch.stack(audio_list_l, dim=0).to(device)
                    unwrapped_vae = accelerator.unwrap_model(vae)

                    duration = torch.tensor(duration, device=device)
                    duration = torch.clamp(duration, max=length)  ## clamp duration to max audio length

                    audio_latent_w = unwrapped_vae.encode(audio_input_w).latent_dist.sample()
                    audio_latent_l = unwrapped_vae.encode(audio_input_l).latent_dist.sample()
                    audio_latent = torch.cat((audio_latent_w, audio_latent_l), dim=0)
                    audio_latent = audio_latent.transpose(1, 2)  ## Transpose to (bsz, seq_len, channel)

                loss, raw_model_loss, raw_ref_loss, implicit_acc = model(
                    audio_latent, text, duration=duration, sft=False
                )

                total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()

                if accelerator.sync_gradients:
                    # accelerator.clip_grad_value_(model.parameters(), 1.0)
                    progress_bar.update(1)
                    completed_steps += 1

                if completed_steps % 10 == 0 and accelerator.is_main_process:

                    total_norm = 0.0
                    for p in model.parameters():
                        if p.grad is not None:
                            param_norm = p.grad.data.norm(2)
                            total_norm += param_norm.item() ** 2

                    total_norm = total_norm**0.5
                    logger.info(f"Step {completed_steps}, Loss: {loss.item()}, Grad Norm: {total_norm}")

                    lr = lr_scheduler.get_last_lr()[0]

                    result = {
                        "train_loss": loss.item(),
                        "grad_norm": total_norm,
                        "learning_rate": lr,
                        "raw_model_loss": raw_model_loss,
                        "raw_ref_loss": raw_ref_loss,
                        "implicit_acc": implicit_acc,
                    }

                    # result["val_loss"] = round(total_val_loss.item()/len(eval_dataloader), 4)
                    wandb.log(result, step=completed_steps)

            # Checks if the accelerator has performed an optimization step behind the scenes

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps}"
                    if output_dir is not None:
                        output_dir = os.path.join(output_dir, output_dir)
                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        eval_progress_bar = tqdm(range(len(eval_dataloader)), disable=not accelerator.is_local_main_process)
        for step, batch in enumerate(eval_dataloader):
            with accelerator.accumulate(model), torch.no_grad():
                device = model.device
                text, audios, duration, _ = batch

                audio_list = []
                for audio_path in audios:
                    ## Only read the first `length` seconds of audio
                    wav = read_wav_file(audio_path, length)
                    if wav.shape[0] == 1:
                        ## If this audio is mono, repeat the channel so it becomes "fake stereo"
                        wav = wav.repeat(2, 1)
                    audio_list.append(wav)

                audio_input = torch.stack(audio_list, dim=0)
                audio_input = audio_input.to(device)
                duration = torch.tensor(duration, device=device)
                unwrapped_vae = accelerator.unwrap_model(vae)
                audio_latent = unwrapped_vae.encode(audio_input).latent_dist.sample()
                audio_latent = audio_latent.transpose(1, 2)  ## Transpose to (bsz, seq_len, channel)

                val_loss, _, _, _ = model(audio_latent, text, duration=duration, sft=True)

                total_val_loss += val_loss.detach().float()
                eval_progress_bar.update(1)

        if accelerator.is_main_process:

            result = {}
            result["epoch"] = float(epoch + 1)

            result["epoch/train_loss"] = round(total_loss.item() / len(train_dataloader), 4)
            result["epoch/val_loss"] = round(total_val_loss.item() / len(eval_dataloader), 4)

            wandb.log(result, step=completed_steps)

            with open("{}/summary.jsonl".format(output_dir), "a") as f:
                f.write(json.dumps(result) + "\n\n")

            logger.info(result)

        save_checkpoint = True
        accelerator.wait_for_everyone()
        if accelerator.is_main_process and args.checkpointing_steps == "best":
            if save_checkpoint:
                accelerator.save_state("{}/{}".format(output_dir, "best"))

            if (epoch + 1) % args.save_every == 0:
                accelerator.save_state("{}/{}".format(output_dir, "epoch_" + str(epoch + 1)))

        if accelerator.is_main_process and args.checkpointing_steps == "epoch":
            accelerator.save_state("{}/{}".format(output_dir, "epoch_" + str(epoch + 1)))


if __name__ == "__main__":
    main()
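
For intuition about the (loss, raw_model_loss, raw_ref_loss, implicit_acc) tuple returned when sft=False, here is a generic DPO-style preference objective over per-pair regression losses. This is a sketch only, not TangoFlux's exact model.py implementation; the function name, the beta value, and the made-up losses are illustrative assumptions.

    import torch
    import torch.nn.functional as F

    def dpo_style_preference_loss(model_loss_w, model_loss_l, ref_loss_w, ref_loss_l, beta=1.0):
        # All arguments are 1-D tensors with one per-sample loss per preference pair.
        # A lower loss means a better fit, so the policy's "advantage" on each half of
        # the pair is measured relative to the frozen reference model.
        model_diff = model_loss_w - model_loss_l      # trainable model: chosen vs. rejected
        ref_diff = ref_loss_w - ref_loss_l            # frozen reference: chosen vs. rejected
        logits = -(model_diff - ref_diff)             # > 0 when the policy prefers "chosen" more than the reference does
        loss = -F.logsigmoid(beta * logits).mean()
        implicit_acc = (logits > 0).float().mean()    # fraction of pairs ranked the intended way
        return loss, implicit_acc

    # Example with made-up per-pair losses:
    w, l = torch.tensor([0.20, 0.35]), torch.tensor([0.30, 0.33])
    rw, rl = torch.tensor([0.25, 0.34]), torch.tensor([0.28, 0.35])
    loss, acc = dpo_style_preference_loss(w, l, rw, rl, beta=10.0)
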
external_models/TangoFlux/tangoflux/utils.py
ADDED
@@ -0,0 +1,159 @@
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

import torchaudio
import random
import itertools


def normalize_wav(waveform):
    waveform = waveform - torch.mean(waveform)
    waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
    return waveform * 0.5


def pad_wav(waveform, segment_length):
    waveform_length = len(waveform)

    if segment_length is None or waveform_length == segment_length:
        return waveform
    elif waveform_length > segment_length:
        return waveform[:segment_length]
    else:
        padded_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
        waveform = torch.cat([waveform, padded_wav])
        return waveform


def read_wav_file(filename, duration_sec):
    info = torchaudio.info(filename)
    sample_rate = info.sample_rate

    # Calculate the number of frames corresponding to the desired duration
    num_frames = int(sample_rate * duration_sec)

    waveform, sr = torchaudio.load(filename, num_frames=num_frames)  # Faster!!!

    if waveform.shape[0] == 2:  ## Stereo audio
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=44100)
        resampled_waveform = resampler(waveform)
        # print(resampled_waveform.shape)
        ## We pad the left and right channels separately
        padded_left = pad_wav(resampled_waveform[0], int(44100 * duration_sec))
        padded_right = pad_wav(resampled_waveform[1], int(44100 * duration_sec))

        return torch.stack([padded_left, padded_right])
    else:
        waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=44100)[0]
        waveform = pad_wav(waveform, int(44100 * duration_sec)).unsqueeze(0)

        return waveform


class DPOText2AudioDataset(Dataset):
    def __init__(
        self,
        dataset,
        prefix,
        text_column,
        audio_w_column,
        audio_l_column,
        duration,
        num_examples=-1,
    ):

        inputs = list(dataset[text_column])
        self.inputs = [prefix + inp for inp in inputs]
        self.audios_w = list(dataset[audio_w_column])
        self.audios_l = list(dataset[audio_l_column])
        self.durations = list(dataset[duration])
        self.indices = list(range(len(self.inputs)))

        self.mapper = {}
        for index, audio_w, audio_l, duration, text in zip(
            self.indices, self.audios_w, self.audios_l, self.durations, inputs
        ):
            self.mapper[index] = [audio_w, audio_l, duration, text]

        if num_examples != -1:
            self.inputs, self.audios_w, self.audios_l, self.durations = (
                self.inputs[:num_examples],
                self.audios_w[:num_examples],
                self.audios_l[:num_examples],
                self.durations[:num_examples],
            )
            self.indices = self.indices[:num_examples]

    def __len__(self):
        return len(self.inputs)

    def get_num_instances(self):
        return len(self.inputs)

    def __getitem__(self, index):
        s1, s2, s3, s4, s5 = (
            self.inputs[index],
            self.audios_w[index],
            self.audios_l[index],
            self.durations[index],
            self.indices[index],
        )
        return s1, s2, s3, s4, s5

    def collate_fn(self, data):
        dat = pd.DataFrame(data)
        return [dat[i].tolist() for i in dat]


class Text2AudioDataset(Dataset):
    def __init__(
        self, dataset, prefix, text_column, audio_column, duration, num_examples=-1
    ):

        inputs = list(dataset[text_column])
        self.inputs = [prefix + inp for inp in inputs]
        self.audios = list(dataset[audio_column])
        self.durations = list(dataset[duration])
        self.indices = list(range(len(self.inputs)))

        self.mapper = {}
        for index, audio, duration, text in zip(
            self.indices, self.audios, self.durations, inputs
        ):
            self.mapper[index] = [audio, text, duration]

        if num_examples != -1:
            self.inputs, self.audios, self.durations = (
                self.inputs[:num_examples],
                self.audios[:num_examples],
                self.durations[:num_examples],
            )
            self.indices = self.indices[:num_examples]

    def __len__(self):
        return len(self.inputs)

    def get_num_instances(self):
        return len(self.inputs)

    def __getitem__(self, index):
        s1, s2, s3, s4 = (
            self.inputs[index],
            self.audios[index],
            self.durations[index],
            self.indices[index],
        )
        return s1, s2, s3, s4

    def collate_fn(self, data):
        dat = pd.DataFrame(data)
        return [dat[i].tolist() for i in dat]
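
A minimal usage sketch of the helpers above. The train.json file, its audio paths, and the duration value are hypothetical; the "captions" and "location" column names follow the --text_column / --audio_column defaults of the training scripts, and "duration" is the column name they pass explicitly.

    import json
    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from tangoflux.utils import Text2AudioDataset, read_wav_file

    # One JSON-lines record per example. For train_dpo.py, DPOText2AudioDataset expects the
    # same layout plus "chosen" and "reject" audio-path columns instead of "location".
    with open("train.json", "w") as f:
        f.write(json.dumps({"captions": "Birds chirping near a stream",
                            "location": "data/audio/birds_stream.wav",
                            "duration": 10.0}) + "\n")

    raw = load_dataset("json", data_files={"train": "train.json"})
    ds = Text2AudioDataset(raw["train"], prefix="", text_column="captions",
                           audio_column="location", duration="duration", num_examples=-1)
    loader = DataLoader(ds, batch_size=1, shuffle=False, collate_fn=ds.collate_fn)

    text, paths, durations, indices = next(iter(loader))
    wav = read_wav_file(paths[0], duration_sec=10)   # resampled to 44.1 kHz, padded/truncated to 10 s
    print(wav.shape)                                 # (1, 441000) for mono input, (2, 441000) for stereo
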
external_models/TangoFlux/train.sh
ADDED
@@ -0,0 +1,2 @@

CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' tangoflux/train.py --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml'
external_models/depth-fm/.gitignore
ADDED
@@ -0,0 +1,5 @@
*__pycache__*
sandbox
*.ckpt
*-depth.png
evaluation