Spaces:

DennisHung
/

DiffmorpherXAp-adapter

Runtime error

File size: 6,918 Bytes

import os
import gc
import torch
import shutil
import atexit
import torchaudio
import numpy as np
import gradio as gr
from pipeline.morph_pipeline_successed_ver1 import AudioLDM2MorphPipeline 
# Restrict this process to one physical GPU. The variable has to be set before
# the first CUDA call for it to take effect; the card it selects is then
# addressed by PyTorch as logical device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
# Initialize AudioLDM2 Pipeline
# Only select a CUDA device when one is actually usable. An unconditional
# torch.cuda.set_device(0) raises on hosts without a visible GPU, which made
# the CPU fallback below unreachable.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
dtype = torch.float32
pipeline = AudioLDM2MorphPipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=dtype)
# Fall back to the CPU when no GPU is visible to this process.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)


def morph_audio(audio_file1, audio_file2, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
    """Run the morphing pipeline on two audio files and return the generated clips.

    The ``output`` directory is deleted and recreated on every call, the
    module-level ``pipeline`` is invoked with ``num_frames=5``, and every
    ``.wav`` file found in ``output`` afterwards is returned.

    Args:
        audio_file1: Path of the first audio file.
        audio_file2: Path of the second audio file.
        num_inference_steps: Forwarded to the pipeline unchanged.
        prompt1: Forwarded to the pipeline as ``prompt_1``.
        prompt2: Forwarded to the pipeline as ``prompt_2``.
        negative_prompt1: Forwarded to the pipeline as ``negative_prompt_1``.
        negative_prompt2: Forwarded to the pipeline as ``negative_prompt_2``.

    Returns:
        A list of paths to the ``.wav`` files in ``output``, ordered by the
        integer value of each file name without its extension. The UI wires
        this to five audio players, so five files are presumably expected
        (verify against the pipeline).

    Raises:
        ValueError: If a ``.wav`` file in ``output`` does not have an integer
            file name (raised by the sort key).
        Exception: Anything raised by ``torchaudio.load`` or the pipeline is
            propagated unchanged.
    """
    save_lora_dir = "output"
    # Start from an empty directory so files from an earlier run are never
    # returned as results of this one.
    if os.path.exists(save_lora_dir):
        shutil.rmtree(save_lora_dir)
    os.makedirs(save_lora_dir, exist_ok=True)

    # Load audio and compute duration (samples / sample rate, in seconds).
    waveform1, sample_rate1 = torchaudio.load(audio_file1)
    duration1 = waveform1.shape[1] / sample_rate1
    waveform2, sample_rate2 = torchaudio.load(audio_file2)
    duration2 = waveform2.shape[1] / sample_rate2
    # The waveforms are only needed for their length; the pipeline reads the
    # files itself, so release them before the heavy work starts.
    del waveform1, waveform2

    # Compare durations and take the shorter one. int() truncates, so clips
    # shorter than one second give 0.
    # NOTE(review): confirm the pipeline accepts audio_length_in_s == 0.
    duration = int(min(duration1, duration2))

    try:
        # Perform morphing using the pipeline. The return value is not used;
        # results are read back from the output directory below.
        pipeline(
            dtype=dtype,
            audio_file=audio_file1,
            audio_file2=audio_file2,
            audio_length_in_s=duration,
            time_pooling=2,
            freq_pooling=2,
            prompt_1=prompt1,
            prompt_2=prompt2,
            negative_prompt_1=negative_prompt1,
            negative_prompt_2=negative_prompt2,
            save_lora_dir=save_lora_dir,
            use_adain=True,
            use_reschedule=False,
            num_inference_steps=num_inference_steps,
            lamd=0.6,
            output_path=save_lora_dir,
            num_frames=5,
            fix_lora=None,
            use_lora=True,
            lora_steps=2,
            noisy_latent_with_lora=True,
            morphing_with_lora=True,
            use_morph_prompt=True,
            guidance_scale=7.5,
        )

        # Collect the output file paths. save_lora_dir is also passed as the
        # LoRA directory, hence the ".wav" filter.
        wav_paths = [
            os.path.join(save_lora_dir, file)
            for file in os.listdir(save_lora_dir)
            if file.endswith(".wav")
        ]
        # Sort numerically so that e.g. "10.wav" comes after "2.wav".
        return sorted(
            wav_paths,
            key=lambda x: int(os.path.splitext(os.path.basename(x))[0]),
        )
    finally:
        # Release memory whether the run succeeded or failed, so a failed
        # request does not leave memory pinned for the next one.
        torch.cuda.empty_cache()
        gc.collect()

def morph_audio_with_morphing_factor(audio_file1, audio_file2, alpha, num_inference_steps, prompt1='', prompt2='', negative_prompt1="Low quality", negative_prompt2="Low quality"):
    """Run the morphing pipeline for one morphing factor and return the result path.

    The ``output`` directory is deleted and recreated on every call, then the
    module-level ``pipeline`` is invoked with ``morphing_factor=alpha``.

    Args:
        audio_file1: Path of the first audio file.
        audio_file2: Path of the second audio file.
        alpha: Forwarded to the pipeline as ``morphing_factor``. The UI
            supplies a value between 0 and 1.
        num_inference_steps: Forwarded to the pipeline unchanged.
        prompt1: Forwarded to the pipeline as ``prompt_1``.
        prompt2: Forwarded to the pipeline as ``prompt_2``.
        negative_prompt1: Forwarded to the pipeline as ``negative_prompt_1``.
        negative_prompt2: Forwarded to the pipeline as ``negative_prompt_2``.

    Returns:
        The path ``output/interpolated.wav``. The pipeline is presumably the
        one that writes this file (verify against the pipeline).

    Raises:
        RuntimeError: Any runtime error from the pipeline is re-raised
            unchanged. Previously only CUDA out-of-memory errors were
            re-raised; other runtime errors were swallowed and surfaced as an
            unrelated ``UnboundLocalError`` at the return statement.
        Exception: Anything else raised by ``torchaudio.load`` or the pipeline
            is propagated unchanged.
    """
    save_lora_dir = "output"
    # Start from an empty directory so a stale interpolated.wav from an
    # earlier run can never be returned.
    if os.path.exists(save_lora_dir):
        shutil.rmtree(save_lora_dir)
    os.makedirs(save_lora_dir, exist_ok=True)

    # Load audio and compute duration (samples / sample rate, in seconds).
    waveform1, sample_rate1 = torchaudio.load(audio_file1)
    duration1 = waveform1.shape[1] / sample_rate1
    waveform2, sample_rate2 = torchaudio.load(audio_file2)
    duration2 = waveform2.shape[1] / sample_rate2
    # The waveforms are only needed for their length; release them early.
    del waveform1, waveform2

    # Compare durations and take the shorter one. int() truncates, so clips
    # shorter than one second give 0.
    # NOTE(review): confirm the pipeline accepts audio_length_in_s == 0.
    duration = int(min(duration1, duration2))

    try:
        # Perform morphing using the pipeline. The return value is not used.
        pipeline(
            dtype=dtype,
            morphing_factor=alpha,
            audio_file=audio_file1,
            audio_file2=audio_file2,
            audio_length_in_s=duration,
            time_pooling=2,
            freq_pooling=2,
            prompt_1=prompt1,
            prompt_2=prompt2,
            negative_prompt_1=negative_prompt1,
            negative_prompt_2=negative_prompt2,
            save_lora_dir=save_lora_dir,
            use_adain=True,
            use_reschedule=False,
            num_inference_steps=num_inference_steps,
            lamd=0.6,
            output_path=save_lora_dir,
            num_frames=5,
            fix_lora=None,
            use_lora=True,
            lora_steps=2,
            noisy_latent_with_lora=True,
            morphing_with_lora=True,
            use_morph_prompt=True,
            guidance_scale=7.5,
        )
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory. Releasing unused memory...")
        # Always re-raise: swallowing the error here would only hide the real
        # failure behind a missing-result error further down.
        raise
    finally:
        # Release memory on success and on failure alike.
        torch.cuda.empty_cache()
        gc.collect()

    return os.path.join(save_lora_dir, 'interpolated.wav')

def cleanup_output_dir():
    """Delete the ``output`` working directory if it exists.

    Does nothing when the directory is absent. Prints a confirmation line
    after a successful removal. Registered below to run at interpreter exit.
    """
    save_lora_dir = "output"
    if not os.path.exists(save_lora_dir):
        return
    shutil.rmtree(save_lora_dir)
    print(f"Cleaned up directory: {save_lora_dir}")


atexit.register(cleanup_output_dir)

# Gradio interface function
def interface(audio1, audio2, alpha, num_inference_steps):
    """Forward the four inputs to ``morph_audio_with_morphing_factor``.

    The prompt arguments are left at that function's defaults, and its
    return value is passed back unchanged.
    """
    return morph_audio_with_morphing_factor(audio1, audio2, alpha, num_inference_steps)

# Gradio Interface
# demo = gr.Interface(
#     fn=interface,
#     inputs=[
#         gr.Audio(label="Upload Audio File 1", type="filepath"),
#         gr.Audio(label="Upload Audio File 2", type="filepath"),
#         gr.Slider(0, 1, step=0.01, label="Interpolation Alpha"),
#         gr.Slider(10, 50, step=1, label="Inference Steps"),
#         # gr.Textbox(label="Prompt for Audio File 1"),
#         # gr.Textbox(label="Prompt for Audio File 2"),
#     ],
#     outputs=gr.Audio(label="Interpolated Audio")
# )


# Two-tab Gradio UI. Each tab has its own inputs and its own Submit button,
# wired to one of the two morphing functions.
with gr.Blocks() as demo:
    # Tab 1: calls morph_audio and shows five audio players for its result.
    with gr.Tab("Sound Morphing with fixed frames."):
        gr.Markdown("### Upload two audio files for morphing")
        with gr.Row():
            # type="filepath": the handler receives file paths.
            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
            num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
        # Five output components; morph_audio returns a list of paths.
        # NOTE(review): the list presumably has to contain exactly five entries
        # to match these components — confirm against the pipeline's output.
        outputs = [
            gr.Audio(label="Morphing audio 1"),
            gr.Audio(label="Morphing audio 2"),
            gr.Audio(label="Morphing audio 3"),
            gr.Audio(label="Morphing audio 4"),
            gr.Audio(label="Morphing audio 5"),
        ]
        submit_btn1 = gr.Button("Submit")
        # Only the two files and the step count are passed, so the prompt
        # arguments of morph_audio keep their defaults.
        submit_btn1.click(morph_audio, inputs=[audio1, audio2, num_inference_steps], outputs=outputs)
    
    # Tab 2: calls morph_audio_with_morphing_factor and shows a single player.
    # NOTE: audio1, audio2, num_inference_steps and outputs are rebound below.
    # The first tab's click handler was already registered with the earlier
    # component objects, so the two tabs stay independent.
    with gr.Tab("Sound Morphing with specified morphing factor."):
        gr.Markdown("### Upload two audio files for morphing")
        with gr.Row():
            audio1 = gr.Audio(label="Upload Audio File 1", type="filepath")
            audio2 = gr.Audio(label="Upload Audio File 2", type="filepath")
            # No explicit `value` is given for alpha.
            # NOTE(review): presumably starts at the slider minimum — confirm.
            alpha = gr.Slider(0, 1, step=0.01, label="Interpolation Alpha")
            num_inference_steps = gr.Slider(10, 50, step=1, label="Inference Steps", value=50)
        outputs=gr.Audio(label="Interpolated Audio")
        submit_btn2 = gr.Button("Submit")
        # The input order matches the function's positional parameters:
        # (audio_file1, audio_file2, alpha, num_inference_steps).
        submit_btn2.click(morph_audio_with_morphing_factor, inputs=[audio1, audio2, alpha, num_inference_steps], outputs=outputs)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    # NOTE(review): share=True presumably requests a public Gradio share link;
    # confirm this is wanted for the deployment target.
    demo.launch(share=True)