Ritesh-hf committed on
Commit
d32b059
·
verified ·
1 Parent(s): f8321e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -162
app.py CHANGED
@@ -1,162 +1,161 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
- import numpy as np
4
- import pandas as pd
5
- import re
6
- from pydub import AudioSegment
7
- from pydub.generators import Sine
8
- import io
9
- from scipy.signal import resample
10
-
11
- MODEL_NAME = "openai/whisper-tiny"
12
- BATCH_SIZE = 8
13
- # device = 0 if torch.cuda.is_available() else "cpu"
14
-
15
- pipe = pipeline(
16
- task="automatic-speech-recognition",
17
- model=MODEL_NAME,
18
- chunk_length_s=30,
19
- # device=device,
20
- )
21
-
22
- arabic_bad_Words = pd.read_csv("arabic_bad_words_dataset.csv")
23
- english_bad_Words = pd.read_csv("english_bad_words_dataset.csv")
24
-
25
-
26
- def clean_english_word(word):
27
- # Use regex to remove special characters, punctuation, and spaces around words
28
- cleaned_text = re.sub(r'^[\s\W_]+|[\s\W_]+$', '', word)
29
- return cleaned_text
30
-
31
- def clean_arabic_word(word):
32
- # Define a regex pattern to match any non-Arabic letter character
33
- pattern = r'[^\u0600-\u06FF]'
34
- # Replace any character matching the pattern with an empty string
35
- cleaned_word = re.sub(pattern, '', word)
36
- return cleaned_word
37
-
38
- def classifier(word_list_with_timestamp, language):
39
-
40
- foul_words = []
41
- negative_timestamps = []
42
-
43
- if language == "English":
44
- list_to_search = set(english_bad_Words["words"])
45
- for item in word_list_with_timestamp:
46
- word = clean_english_word(item['text'])
47
- if word in list_to_search:
48
- foul_words.append(word)
49
- negative_timestamps.append(item['timestamp'])
50
- else:
51
- list_to_search = list(arabic_bad_Words["words"])
52
- for item in word_list_with_timestamp:
53
- word = clean_arabic_word(item['text'])
54
- for word_in_list in list_to_search:
55
- if word_in_list == word:
56
- foul_words.append(word)
57
- negative_timestamps.append(item['timestamp'])
58
- break
59
-
60
- return [foul_words, negative_timestamps]
61
-
62
- def generate_bleep(duration_ms, frequency=1000):
63
- sine_wave = Sine(frequency)
64
- bleep = sine_wave.to_audio_segment(duration=duration_ms)
65
- return bleep
66
-
67
- def mute_audio_range(audio_filepath, ranges, bleep_frequency=800):
68
- audio = AudioSegment.from_file(audio_filepath)
69
-
70
- for range in ranges:
71
- start_time = range[0] - 0.1
72
- end_time = range[-1] + 0.1
73
- start_ms = start_time * 1000 # pydub works with milliseconds
74
- end_ms = end_time * 1000
75
- duration_ms = end_ms - start_ms
76
-
77
- # Generate the bleep sound
78
- bleep_sound = generate_bleep(duration_ms, bleep_frequency)
79
-
80
- # Combine the original audio with the bleep sound
81
- audio = audio[:start_ms] + bleep_sound + audio[end_ms:]
82
-
83
- return audio
84
-
85
- def format_output_to_list(data):
86
- formatted_list = "\n".join([f"{item['timestamp'][0]}s - {item['timestamp'][1]}s \t : {item['text']}" for item in data])
87
- return formatted_list
88
-
89
- def transcribe(input_audio, audio_language, task, timestamp_type):
90
- if input_audio is None:
91
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
92
-
93
- if timestamp_type == "sentence":
94
- timestamp_type = True
95
- else:
96
- timestamp_type = "word"
97
-
98
- output = pipe(input_audio, batch_size=BATCH_SIZE, return_timestamps=timestamp_type, generate_kwargs={"task": task})
99
- text = output['text']
100
-
101
- timestamps = format_output_to_list(output['chunks'])
102
-
103
- foul_words, negative_timestamps = classifier(output['chunks'], audio_language)
104
- foul_words = ", ".join(foul_words)
105
-
106
-
107
- audio_output = mute_audio_range(input_audio, negative_timestamps)
108
- # Save the output audio to a BytesIO object
109
- output_buffer = io.BytesIO()
110
- audio_output.export(output_buffer, format="wav")
111
- output_buffer.seek(0)
112
-
113
- # Read the audio data from the BytesIO buffer
114
- sample_rate = audio_output.frame_rate
115
- audio_data = np.frombuffer(output_buffer.read(), dtype=np.int16)
116
-
117
-
118
- return [text, timestamps, foul_words, (sample_rate, audio_data)]
119
-
120
- examples = [
121
- ["arabic_english_audios/audios/arabic_audio_1.wav", 'Arabic', 'transcribe', 'word'],
122
- ["arabic_english_audios/audios/arabic_audio_2.wav", 'Arabic', 'transcribe', 'word'],
123
- ["arabic_english_audios/audios/arabic_audio_3.wav", 'Arabic', 'transcribe', 'word'],
124
- ["arabic_english_audios/audios/arabic_audio_4.wav", 'Arabic', 'transcribe', 'word'],
125
- ["arabic_english_audios/audios/arabic_hate_audio_1.mp3", 'Arabic', 'transcribe', 'word'],
126
- ["arabic_english_audios/audios/arabic_hate_audio_2.mp3", 'Arabic', 'transcribe', 'word'],
127
- ["arabic_english_audios/audios/arabic_hate_audio_3.mp3", 'Arabic', 'transcribe', 'word'],
128
- ["arabic_english_audios/audios/english_audio_1.wav", 'English', 'transcribe', 'word'],
129
- ["arabic_english_audios/audios/english_audio_2.mp3", 'English', 'transcribe', 'word'],
130
- ["arabic_english_audios/audios/english_audio_3.mp3", 'English', 'transcribe', 'word'],
131
- ["arabic_english_audios/audios/english_audio_4.mp3", 'English', 'transcribe', 'word'],
132
- ["arabic_english_audios/audios/english_audio_5.mp3", 'English', 'transcribe', 'word'],
133
- ["arabic_english_audios/audios/english_audio_6.wav", 'English', 'transcribe', 'word']
134
- ]
135
-
136
- with gr.Blocks(theme=gr.themes.Default()) as demo:
137
- gr.HTML("<h2 style='text-align: center;'>Transcribing Audio with Timestamps using whisper-large-v3</h2>")
138
- # gr.Markdown("")
139
- with gr.Row():
140
- with gr.Column():
141
- audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio file")
142
- audio_language = gr.Radio(["Arabic", "English"], label="Audio Language")
143
- task = gr.Radio(["transcribe", "translate"], label="Task")
144
- timestamp_type = gr.Radio(["sentence", "word"], label="Timestamp Type")
145
- with gr.Row():
146
- clear_button = gr.ClearButton(value="Clear")
147
- submit_button = gr.Button("Submit", variant="primary", )
148
-
149
- with gr.Column():
150
- transcript_output = gr.Text(label="Transcript")
151
- timestamp_output = gr.Text(label="Timestamps")
152
- foul_words = gr.Text(label="Foul Words")
153
- output_audio = gr.Audio(label="Output Audio", type="numpy")
154
-
155
- examples = gr.Examples(examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=20)
156
-
157
- submit_button.click(fn=transcribe, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio])
158
- clear_button.add([audio_input, audio_language, task, timestamp_type, transcript_output, timestamp_output, foul_words, output_audio])
159
-
160
-
161
- if __name__ == "__main__":
162
- demo.launch()
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import numpy as np
4
+ import pandas as pd
5
+ import re
6
+ from pydub import AudioSegment
7
+ from pydub.generators import Sine
8
+ import io
9
+ from scipy.signal import resample
10
+
11
# Whisper checkpoint used for transcription; "tiny" keeps inference cheap on CPU.
MODEL_NAME = "openai/whisper-tiny"
BATCH_SIZE = 8
# device = 0 if torch.cuda.is_available() else "cpu"

# ASR pipeline with 30-second chunking so recordings longer than the model's
# context are processed piecewise.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    # device=device,
)

# Profanity lexicons. Both CSVs are read with a "words" column by classifier()
# below — assumes that column exists in each file; TODO confirm schema.
arabic_bad_Words = pd.read_csv("arabic_bad_words_dataset.csv")
english_bad_Words = pd.read_csv("english_bad_words_dataset.csv")
24
+
25
+
26
def clean_english_word(word):
    """Trim whitespace, punctuation and underscores from both ends of *word*."""
    # Strip any run of non-word characters (plus '_') at either edge;
    # interior characters are left untouched.
    return re.sub(r'^[\s\W_]+|[\s\W_]+$', '', word)
30
+
31
def clean_arabic_word(word):
    """Keep only characters in the Arabic Unicode block (U+0600-U+06FF)."""
    # Everything outside the Arabic block — Latin letters, digits,
    # punctuation, whitespace — is removed.
    return re.sub(r'[^\u0600-\u06FF]', '', word)
37
+
38
def classifier(word_list_with_timestamp, language):
    """Scan transcription chunks for profanity.

    Parameters
    ----------
    word_list_with_timestamp : list of dict
        Whisper pipeline chunks, each with a 'text' and a 'timestamp' key.
    language : str
        "English" selects the English lexicon and cleaner; any other value
        falls back to the Arabic lexicon and cleaner (matches original behavior).

    Returns
    -------
    list
        ``[foul_words, negative_timestamps]`` — matched words and their
        timestamps, in input order.
    """
    # Pick lexicon + cleaner once, then run a single shared loop.
    if language == "English":
        bad_words = set(english_bad_Words["words"])
        clean = clean_english_word
    else:
        # Set membership replaces the original O(len(lexicon)) inner scan
        # per word; string equality semantics are unchanged.
        bad_words = set(arabic_bad_Words["words"])
        clean = clean_arabic_word

    foul_words = []
    negative_timestamps = []
    for item in word_list_with_timestamp:
        word = clean(item['text'])
        if word in bad_words:
            foul_words.append(word)
            negative_timestamps.append(item['timestamp'])

    return [foul_words, negative_timestamps]
61
+
62
def generate_bleep(duration_ms, frequency=1000):
    """Return a pure sine-tone AudioSegment of *duration_ms* milliseconds at *frequency* Hz."""
    return Sine(frequency).to_audio_segment(duration=duration_ms)
66
+
67
def mute_audio_range(audio_filepath, ranges, bleep_frequency=800):
    """Replace each time range in an audio file with a censor bleep.

    Parameters
    ----------
    audio_filepath : str
        Path to the input audio (any format pydub/ffmpeg can decode).
    ranges : iterable
        Timestamp pairs in seconds, e.g. ``(start, end)``; each interval is
        padded by 0.1 s on both sides before bleeping.
        NOTE(review): Whisper word timestamps can reportedly be ``None`` for a
        final chunk — confirm upstream; a ``None`` here would raise TypeError.
    bleep_frequency : int
        Frequency of the bleep tone in Hz.

    Returns
    -------
    AudioSegment
        The censored audio, same overall duration as the input.
    """
    audio = AudioSegment.from_file(audio_filepath)

    # Renamed loop variable: the original shadowed the builtin `range`.
    for span in ranges:
        # Pad the interval slightly, clamping the start at 0: a negative
        # slice index in `audio[:start_ms]` would slice from the *end* of
        # the segment and corrupt the output.
        start_time = max(span[0] - 0.1, 0)
        end_time = span[-1] + 0.1
        start_ms = start_time * 1000  # pydub works with milliseconds
        end_ms = end_time * 1000
        duration_ms = end_ms - start_ms

        # Generate a bleep of the same length and splice it over the interval,
        # so overall duration (and later spans' positions) are preserved.
        bleep_sound = generate_bleep(duration_ms, bleep_frequency)
        audio = audio[:start_ms] + bleep_sound + audio[end_ms:]

    return audio
84
+
85
def format_output_to_list(data):
    """Render pipeline chunks one per line as "<start>s - <end>s \\t : <text>"."""
    lines = []
    for chunk in data:
        start = chunk['timestamp'][0]
        end = chunk['timestamp'][1]
        lines.append(f"{start}s - {end}s \t : {chunk['text']}")
    return "\n".join(lines)
88
+
89
def transcribe(input_audio, audio_language, task, timestamp_type):
    """Transcribe audio, flag foul words, and return a bleeped version.

    Parameters
    ----------
    input_audio : str | None
        Filepath supplied by the gr.Audio component.
    audio_language : str
        "Arabic" or "English"; selects the profanity lexicon.
    task : str
        "transcribe" or "translate", forwarded to the Whisper pipeline.
    timestamp_type : str
        "sentence" or "word" timestamp granularity.

    Returns
    -------
    list
        ``[text, timestamps, foul_words, (sample_rate, audio_data)]``.

    Raises
    ------
    gr.Error
        If no audio file was provided.
    """
    if input_audio is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # The HF pipeline encodes granularity as True (sentence-level) or "word".
    if timestamp_type == "sentence":
        timestamp_type = True
    else:
        timestamp_type = "word"

    output = pipe(input_audio, batch_size=BATCH_SIZE, return_timestamps=timestamp_type, generate_kwargs={"task": task})
    text = output['text']
    timestamps = format_output_to_list(output['chunks'])

    foul_words, negative_timestamps = classifier(output['chunks'], audio_language)
    foul_words = ", ".join(foul_words)

    # Bleep out the offending ranges, then hand raw samples to Gradio.
    audio_output = mute_audio_range(input_audio, negative_timestamps)
    sample_rate = audio_output.frame_rate
    # Read samples directly from the segment instead of exporting to WAV and
    # np.frombuffer-ing the whole buffer: the old path also decoded the
    # RIFF/WAV *header* bytes as int16 samples, corrupting the output start.
    audio_data = np.array(audio_output.get_array_of_samples())
    if audio_output.channels > 1:
        # Gradio's numpy audio format expects (n_samples, n_channels).
        audio_data = audio_data.reshape((-1, audio_output.channels))

    return [text, timestamps, foul_words, (sample_rate, audio_data)]
119
+
120
# Pre-canned demo inputs for gr.Examples:
# [audio filepath, language, task, timestamp granularity].
examples = [
    ["arabic_english_audios/audios/arabic_audio_1.wav", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/arabic_audio_2.wav", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/arabic_audio_3.wav", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/arabic_hate_audio_1.mp3", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/arabic_hate_audio_2.flac", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/arabic_hate_audio_3.mp3", 'Arabic', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_1.wav", 'English', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_2.mp3", 'English', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_3.mp3", 'English', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_4.mp3", 'English', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_5.mp3", 'English', 'transcribe', 'word'],
    ["arabic_english_audios/audios/english_audio_6.wav", 'English', 'transcribe', 'word']
]
134
+
135
# Gradio UI: input controls on the left column, results on the right.
with gr.Blocks(theme=gr.themes.Default()) as demo:
    # NOTE(review): header says whisper-large-v3 but MODEL_NAME above is
    # openai/whisper-tiny — confirm which is intended.
    gr.HTML("<h2 style='text-align: center;'>Transcribing Audio with Timestamps using whisper-large-v3</h2>")
    # gr.Markdown("")
    with gr.Row():
        with gr.Column():
            # Inputs: audio file/mic plus the three transcription options.
            audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio file")
            audio_language = gr.Radio(["Arabic", "English"], label="Audio Language")
            task = gr.Radio(["transcribe", "translate"], label="Task")
            timestamp_type = gr.Radio(["sentence", "word"], label="Timestamp Type")
            with gr.Row():
                clear_button = gr.ClearButton(value="Clear")
                submit_button = gr.Button("Submit", variant="primary", )

        with gr.Column():
            # Outputs: transcript, per-chunk timestamps, detected foul words,
            # and the bleeped audio as (sample_rate, ndarray).
            transcript_output = gr.Text(label="Transcript")
            timestamp_output = gr.Text(label="Timestamps")
            foul_words = gr.Text(label="Foul Words")
            output_audio = gr.Audio(label="Output Audio", type="numpy")

    # Clickable example rows that run transcribe() on canned audio files.
    examples = gr.Examples(examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=20)

    # Wire Submit to the pipeline; Clear resets both inputs and outputs.
    submit_button.click(fn=transcribe, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio])
    clear_button.add([audio_input, audio_language, task, timestamp_type, transcript_output, timestamp_output, foul_words, output_audio])
158
+
159
+
160
# Launch the Gradio app when executed as a script.
if __name__ == "__main__":
    demo.launch()