# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
# %% auto 0
__all__ = ['data', 'audios', 'metadata', 'to_consider', 'processed_metadata', 'repo_id', 'learner', 'categories', 'title',
           'description', 'mic', 'label', 'examples', 'intf', 'process_audio_exists', 'load_x', 'load_label_tfm',
           'classify_audio']
# %% app.ipynb 1
import random  # used by load_x.pad_trunc below
import torch
import gradio as gr
from gradio import CSVLogger
from fastai.vision.all import *
import torchaudio
import torchaudio.transforms as T
import warnings
from huggingface_hub import from_pretrained_fastai
# %% app.ipynb 2
warnings.filterwarnings("ignore")
# %% app.ipynb 3
def process_audio_exists(audio):
    slice_name = audio.name
    # check whether the slice name has a row in the filtered metadata
    # (compare against the column rather than the index so an index of 0 still counts)
    return (processed_metadata['slice_file_name'] == slice_name).any()
# %% app.ipynb 4
data = Path('examples')
audios = get_files(data, extensions='.wav')
metadata = pd.read_csv('UrbanSound8K.csv')
to_consider = ['siren', 'street_music', 'children_playing', 'dog_bark', 'car_horn']
processed_metadata = metadata.loc[metadata['class'].isin(to_consider)].copy()
# remap siren and street_music so the five remaining classes use classIDs 0-4
processed_metadata.loc[processed_metadata['class'] == 'siren', 'classID'] = 4
processed_metadata.loc[processed_metadata['class'] == 'street_music', 'classID'] = 0
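
# Sanity-check sketch (assumptions: the bundled UrbanSound8K.csv has the standard columns
# and the clips under examples/ are UrbanSound8K slices): print the class -> remapped
# classID pairs and try process_audio_exists on the first example clip.
print(processed_metadata[['class', 'classID']].drop_duplicates().sort_values('classID'))
if len(audios):
    print(audios[0].name, 'in metadata:', process_audio_exists(audios[0]))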
# %% app.ipynb 5
class load_x(Transform):
    def __init__(self):
        self.sr = 44100
        self.max_ms = 4000
        self.channels = 2
        # self.transform = transform

    def rechannel(self, waveform, sr):
        if waveform.shape[0] == self.channels:
            # no rechanneling needed
            return waveform, sr
        if self.channels == 1:
            # converting stereo to mono by selecting the first channel
            new_waveform = waveform[:1, :]
        elif self.channels == 2:
            # converting mono to stereo by duplicating the first channel
            new_waveform = torch.cat([waveform, waveform])
        return new_waveform, sr

    def resample(self, waveform, sr):
        if sr == self.sr:
            # no resampling needed
            return waveform, sr
        num_channels = waveform.shape[0]
        # resample first channel
        new_waveform = torchaudio.transforms.Resample(sr, self.sr)(waveform[:1, :])
        if num_channels > 1:
            # resample second channel and merge the two
            re_two = torchaudio.transforms.Resample(sr, self.sr)(waveform[1:, :])
            new_waveform = torch.cat([new_waveform, re_two])
        return new_waveform, self.sr

    def pad_trunc(self, waveform, sr):
        num_channels, num_frames = waveform.shape
        max_len = sr // 1000 * self.max_ms
        if num_frames > max_len:
            # truncate signal to given length
            waveform = waveform[:, :max_len]
        else:
            # get padding lengths for beginning and end
            begin_ln = random.randint(0, max_len - num_frames)
            end_ln = max_len - num_frames - begin_ln
            # pad the audio with zeros
            pad_begin = torch.zeros((num_channels, begin_ln))
            pad_end = torch.zeros((num_channels, end_ln))
            waveform = torch.cat((pad_begin, waveform, pad_end), 1)
        return waveform, sr

    def mel_specgram(self, waveform, sr):
        mel_tfm = T.MelSpectrogram(
            sample_rate=sr,
            n_fft=1024,
            win_length=None,
            hop_length=512,
            center=True,
            pad_mode="reflect",
            power=2.0,
            norm="slaney",
            onesided=True,
            n_mels=128,
            mel_scale="htk")
        spec = mel_tfm(waveform)
        waveform = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
        return waveform, sr

    def encodes(self, x):
        waveform, sr = torchaudio.load(x)
        waveform, sr = self.resample(waveform, sr)
        waveform, sr = self.pad_trunc(waveform, sr)
        waveform, sr = self.rechannel(waveform, sr)
        waveform, sr = self.mel_specgram(waveform, sr)
        return waveform

class load_label_tfm(Transform):
    def __init__(self, metadata=processed_metadata): self.metadata = metadata
    def encodes(self, x):
        # look up the class label for this slice in the metadata
        return self.metadata.loc[self.metadata['slice_file_name'] == x.name]['class'].item()
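
# Minimal usage sketch for the two transforms (assumption: the first clip under examples/
# has a row in the filtered metadata): load_x maps a .wav path to a 2-channel, 128-band
# mel spectrogram in dB, and load_label_tfm returns its class string.
if len(audios) and process_audio_exists(audios[0]):
    _spec = load_x()(audios[0])
    _label = load_label_tfm()(audios[0])
    print(f"{audios[0].name}: label={_label}, spectrogram shape={tuple(_spec.shape)}")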
# %% app.ipynb 6
repo_id = "Jimmie/urban8k"
learner = from_pretrained_fastai(repo_id)
# %% app.ipynb 14
categories = tuple(learner.dls.vocab)
def classify_audio(audio):
    # use Path to open the uploaded audio, then map class probabilities to labels
    audio_path = Path(audio)
    pred, idx, probs = learner.predict(audio_path)
    return dict(zip(categories, map(float, probs)))
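
# Quick smoke-test sketch (assumption: at least one .wav exists under examples/):
# classify_audio returns a {class: probability} dict in the format gr.Label expects.
if len(audios):
    print(classify_audio(str(audios[0])))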
# %% app.ipynb 16
title = "Environmental Sound Classification"
description = """
This demo showcases how AI can be used to recognize environmental sounds. It focuses on five classes: car_horn, children_playing, dog_bark, siren, and street_music.
When uploading audio, make sure it is in .wav format and is less than 4 seconds long.
Enjoy!
"""
mic = gr.Audio(source='upload', type="filepath", label='Upload Audio File here')
label = gr.Label()
examples = list(data.ls())
intf = gr.Interface(fn=classify_audio, inputs=mic, outputs=label, examples=examples,
                    title=title, description=description, cache_examples=False,
                    auto_submit_duration=5)
intf.launch(inline=False)