import os
from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio
import gradio as gr
from moviepy.editor import VideoFileClip

def convert_video_to_audio(video_input):
    """Extract the audio track from a video file and write it as an .m4a.

    Args:
        video_input: Path to the source video file.

    Returns:
        Normalized filesystem path of the written audio file (same base
        name as the video, with an ``.m4a`` extension).
    """
    video_clip = VideoFileClip(video_input)
    audio_clip = video_clip.audio
    # os.path.splitext strips only the final extension; the original
    # `split('.')[0]` truncated at the FIRST dot, mangling paths such as
    # "./clips/my.video.mp4" or any path containing a dotted directory.
    base, _ = os.path.splitext(video_input)
    audio_clip_filepath = os.path.normpath(f"{base}.m4a")
    try:
        audio_clip.write_audiofile(audio_clip_filepath, codec='aac')
    finally:
        # Release the underlying file handles even if encoding fails.
        audio_clip.close()
        video_clip.close()
    return audio_clip_filepath

def speechbrain(input_obj, input_obj_type):
    """Run SepFormer speech enhancement on an uploaded audio or video file.

    Args:
        input_obj: Filepath of the uploaded media.
        input_obj_type: ``"video"`` to extract the audio track first; any
            other value treats ``input_obj`` as an audio filepath directly.

    Returns:
        Path of the enhanced single-channel audio file written at 8 kHz.
    """
    # Cache the pretrained model on the function object: the original
    # re-loaded it from hparams on EVERY request, which dominates latency.
    model = getattr(speechbrain, "_model", None)
    if model is None:
        model = separator.from_hparams(
            source="speechbrain/sepformer-whamr-enhancement",
            savedir='pretrained_models/sepformer-whamr-enhancement',
        )
        speechbrain._model = model
    if input_obj_type == "video":
        aud = convert_video_to_audio(input_obj)
    else:
        aud = input_obj
    est_sources = model.separate_file(path=aud)
    # sepformer-whamr models operate at 8 kHz, so save at that rate.
    torchaudio.save("clean_audio_file.wav", est_sources[:, :, 0].detach().cpu(), 8000)
    return "clean_audio_file.wav"

def main():
    """Build and launch the Gradio demo for SpeechBrain speech enhancement.

    Two tabs share the same ``speechbrain`` backend: one accepts a video
    (audio is extracted first), the other accepts an audio file directly.
    """
    with gr.Blocks(title="Speech Enhancement", delete_cache=(86400, 86400), theme=gr.themes.Ocean()) as demo:
        gr.Markdown("Gradio demo for Speech Enhancement by SpeechBrain. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below.")
        # Fixed: the Github anchor was missing its `target=` attribute name
        # (it read `...enhancement' '_blank'>`), producing malformed HTML.
        gr.Markdown("<p style='text-align: center'><a href='https://arxiv.org/abs/2010.13154' target='_blank'>Attention is All You Need in Speech Separation</a> | <a href='https://github.com/speechbrain/speechbrain/tree/develop/templates/enhancement' target='_blank'>Github Repo</a></p>")
        with gr.Tabs(selected="video") as tabs:
            with gr.Tab("Video", id="video"):
                gr.Interface(
                    fn=speechbrain,
                    inputs=[
                        gr.Video(),
                        # Hidden routing flag so the shared backend knows to
                        # extract audio from the video first.
                        gr.Radio(choices=["video"], value="video", label="File Type"),
                    ],
                    outputs=[
                        gr.Audio(label="Output Audio", type="filepath"),
                    ],
                )
            with gr.Tab("Audio", id="audio"):
                gr.Interface(
                    fn=speechbrain,
                    inputs=[
                        gr.Audio(type="filepath"),
                        gr.Radio(choices=["audio"], value="audio", label="File Type"),
                    ],
                    outputs=[
                        gr.Audio(label="Output Audio", type="filepath"),
                    ],
                )
        demo.launch()

# Launch the app only when run as a script, not when imported as a module.
if __name__ == '__main__':
    main()