"""Gradio demo that denoises speech from an uploaded audio or video file."""

import functools
import os

import gradio as gr
import torchaudio
from moviepy.editor import VideoFileClip
from speechbrain.inference.separation import SepformerSeparation as separator

MODEL_SOURCE = "speechbrain/sepformer-whamr-enhancement"
MODEL_SAVEDIR = 'pretrained_models/sepformer-whamr-enhancement'
OUTPUT_AUDIO_PATH = "clean_audio_file.wav"
# Sample rate passed to torchaudio.save for the enhanced output.
# NOTE(review): presumably the model's native rate -- confirm against the
# model card before changing.
OUTPUT_SAMPLE_RATE = 8000


def convert_video_to_audio(video_input):
    """Extract the audio track of a video into an AAC-encoded ``.m4a`` file.

    The audio file is written next to the video, using the video's path with
    its final extension replaced by ``.m4a``.

    Args:
        video_input: Path to the video file.

    Returns:
        The normalized path of the written ``.m4a`` file.

    Raises:
        ValueError: If the video has no audio track.
    """
    # splitext removes only the last extension, so dots elsewhere in the path
    # ("./clip.mp4", "/tmp/a.b/clip.mp4") no longer truncate the output name.
    stem, _ = os.path.splitext(video_input)
    audio_clip_filepath = os.path.normpath(f"{stem}.m4a")

    video_clip = VideoFileClip(video_input)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in video: {video_input}")
        try:
            audio_clip.write_audiofile(audio_clip_filepath, codec='aac')
        finally:
            audio_clip.close()
    finally:
        # Always release the reader, even when extraction fails.
        video_clip.close()
    return audio_clip_filepath


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the pretrained enhancement model once and reuse it across requests."""
    return separator.from_hparams(source=MODEL_SOURCE, savedir=MODEL_SAVEDIR)


def speechbrain(input_obj, input_obj_type):
    """Run speech enhancement on an uploaded file and save the result as WAV.

    Args:
        input_obj: Path to the uploaded audio or video file.
        input_obj_type: ``"video"`` to extract the audio track first; any
            other value treats ``input_obj`` as an audio file path.

    Returns:
        The path of the written WAV file (``"clean_audio_file.wav"``).

    Raises:
        gr.Error: If no file was provided.
    """
    if input_obj is None:
        raise gr.Error("Please upload a file before submitting.")

    model = _load_model()
    if input_obj_type == "video":
        aud = convert_video_to_audio(input_obj)
    else:
        aud = input_obj

    est_sources = model.separate_file(path=aud)
    # Keep index 0 of the last axis.
    # NOTE(review): presumably the last axis enumerates estimated sources and
    # the enhancement model yields a single one -- confirm.
    enhanced = est_sources[:, :, 0].detach().cpu()
    torchaudio.save(OUTPUT_AUDIO_PATH, enhanced, OUTPUT_SAMPLE_RATE)
    return OUTPUT_AUDIO_PATH


def main():
    """Build the two-tab (video / audio) Gradio UI and launch it."""
    with gr.Blocks(
        title="Speech Enhancement",
        delete_cache=(86400, 86400),
        theme=gr.themes.Ocean(),
    ) as demo:
        gr.Markdown("Gradio demo for Speech Enhancement by SpeechBrain. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below.")
        # Newlines are written as escapes: a plain double-quoted literal
        # cannot span physical lines.
        # NOTE(review): this text reads like link labels whose markup is
        # missing -- confirm and restore the intended links.
        gr.Markdown("\nAttention is All You Need in Speech Separation | Github Repo\n")

        # examples = [
        #     ['samples_audio_samples_test_mixture.wav']
        # ]

        with gr.Tabs(selected="video") as tabs:
            with gr.Tab("Video", id="video"):
                gr.Interface(
                    fn=speechbrain,
                    inputs=[
                        gr.Video(),
                        gr.Radio(choices=["video"], value="video", label="File Type"),
                    ],
                    outputs=[
                        gr.Audio(label="Output Audio", type="filepath"),
                    ],
                )

            with gr.Tab("Audio", id="audio"):
                gr.Interface(
                    fn=speechbrain,
                    inputs=[
                        gr.Audio(type="filepath"),
                        gr.Radio(choices=["audio"], value="audio", label="File Type"),
                    ],
                    outputs=[
                        gr.Audio(label="Output Audio", type="filepath"),
                    ],
                )

    demo.launch()


if __name__ == '__main__':
    main()