Update app.py
app.py CHANGED
@@ -6,15 +6,15 @@ import re
 from pydub import AudioSegment
 from pydub.generators import Sine
 import io
-import torch
+# import torch
 
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-model_id = "openai/whisper-large-v3"
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_id = "openai/whisper-whisper-large-v3"
 
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id, low_cpu_mem_usage=True, use_safetensors=True
 )
-model.to(device)
+# model.to(device)
 
 processor = AutoProcessor.from_pretrained(model_id)
 
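A caution on this hunk: the new model_id, "openai/whisper-whisper-large-v3", does not name a published Hub checkpoint (the id the file used before, "openai/whisper-large-v3", is the real one), so from_pretrained would fail at startup with a repository-not-found error. A quick way to verify an id before committing, using huggingface_hub, which transformers already depends on:

    from huggingface_hub import HfApi

    # Succeeds for a real repo id; raises RepositoryNotFoundError otherwise.
    HfApi().model_info("openai/whisper-large-v3")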
@@ -26,7 +26,7 @@ pipe = pipeline(
     max_new_tokens=128,
     chunk_length_s=30,
     batch_size=8,
-    device=device,
+    # device=device,
 )
 
 
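With import torch, the device probe, model.to(device), and this device= argument all commented out, inference is pinned to the CPU: a transformers pipeline built without a device argument leaves the model wherever from_pretrained put it, which is host memory. A minimal sketch of the setup this commit leaves behind (the task string and the tokenizer/feature-extractor wiring are not visible in the diff and are assumed here):

    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, low_cpu_mem_usage=True, use_safetensors=True
    )  # no .to(device): the weights stay on the CPU
    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",  # task assumed, hidden in the diff
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=8,
        # no device=...: the pipeline will not move the model to a GPU
    )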
@@ -55,7 +55,6 @@ def classifier(word_list_with_timestamp, language):
             if word.lower() in list_to_search:
                 foul_words.append(word)
                 negative_timestamps.append(item['timestamp'])
-                break
     else:
         list_to_search = list(arabic_bad_Words["words"])
         for item in word_list_with_timestamp:
@@ -63,8 +62,7 @@ def classifier(word_list_with_timestamp, language):
             for word_in_list in list_to_search:
                 if word_in_list == word:
                     foul_words.append(word)
-                    negative_timestamps.append(item['timestamp'])
-                    break
+                    negative_timestamps.append(item['timestamp'])
 
     return [foul_words, negative_timestamps]
 
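The two removed breaks are not equivalent. In the English branch (previous hunk) the break sat in the loop over word_list_with_timestamp, so the function stopped after the first foul word and everything later in the clip went uncensored; removing it lets every match be recorded. In the Arabic branch the break only ended the inner scan of list_to_search early after a hit, so removing it does not change the output (assuming the word list holds no duplicates), it just costs a full scan per word. A hedged sketch of the resulting logic; the word = ... lines and the English list's name are not shown in the diff and are assumptions:

    def classifier(word_list_with_timestamp, language):
        foul_words, negative_timestamps = [], []
        if language == 'English':
            list_to_search = set(english_bad_words)   # name and set() assumed
            for item in word_list_with_timestamp:
                word = item['text'].strip()           # shape assumed
                if word.lower() in list_to_search:
                    foul_words.append(word)
                    negative_timestamps.append(item['timestamp'])
                    # no break: later foul words are collected too
        else:
            list_to_search = list(arabic_bad_Words["words"])
            for item in word_list_with_timestamp:
                word = item['text'].strip()           # shape assumed
                for word_in_list in list_to_search:
                    if word_in_list == word:
                        foul_words.append(word)
                        negative_timestamps.append(item['timestamp'])
        return [foul_words, negative_timestamps]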
@@ -76,8 +74,8 @@ def generate_bleep(duration_ms, frequency=1000):
 def mute_audio_range(audio_filepath, ranges, bleep_frequency=800):
     audio = AudioSegment.from_file(audio_filepath)
     for range in ranges:
-        start_time = range[0]
-        end_time = range[-1]
+        start_time = range[0]
+        end_time = range[-1]
         start_ms = start_time * 1000 # pydub works with milliseconds
         end_ms = end_time * 1000
         duration_ms = end_ms - start_ms
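Unrelated to the change itself, but visible in this hunk: the loop variable range shadows Python's builtin range inside the loop body. Harmless today, confusing the moment the body needs the builtin; a rename keeps behavior identical (sketch, with the rest of the loop assumed unchanged):

    for time_range in ranges:
        start_time = time_range[0]
        end_time = time_range[-1]
        start_ms = start_time * 1000   # pydub works with milliseconds
        end_ms = end_time * 1000
        duration_ms = end_ms - start_ms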
@@ -90,6 +88,9 @@ def mute_audio_range(audio_filepath, ranges, bleep_frequency=800):
 
     return audio
 
+def resample_audio(audio_segment, target_sample_rate=16000):
+    return audio_segment.set_frame_rate(target_sample_rate).set_channels(1).set_sample_width(2)
+
 def format_output_to_list(data):
     formatted_list = "\n".join([f"{item['timestamp'][0]}s - {item['timestamp'][1]}s \t : {item['text']}" for item in data])
     return formatted_list
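The new helper normalizes the decoded audio to 16 kHz, mono, 16-bit. pydub's set_frame_rate, set_channels, and set_sample_width each return a new AudioSegment, so the chain has no side effects on the input. 16 kHz is the rate Whisper models are trained on, and the 2-byte sample width matches the np.int16 unpacking later in transcribe. A small usage check (the input file name is hypothetical):

    from pydub import AudioSegment

    seg = AudioSegment.from_file("clip.mp3")
    seg16k = resample_audio(seg)        # defaults to 16000 Hz

    assert seg16k.frame_rate == 16000
    assert seg16k.channels == 1
    assert seg16k.sample_width == 2     # bytes per sample, i.e. int16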
@@ -113,6 +114,10 @@ def transcribe(input_audio, audio_language, task, timestamp_type):
 
 
     audio_output = mute_audio_range(input_audio, negative_timestamps)
+
+    # Resample the output audio to 16kHz
+    audio_output = resample_audio(audio_output, 16000)
+
     # Save the output audio to a BytesIO object
     output_buffer = io.BytesIO()
     audio_output.export(output_buffer, format="wav")
@@ -122,16 +127,26 @@ def transcribe(input_audio, audio_language, task, timestamp_type):
     sample_rate = audio_output.frame_rate
     audio_data = np.frombuffer(output_buffer.read(), dtype=np.int16)
 
-
     return [text, timestamps, foul_words, (sample_rate, audio_data)]
 
 examples = [
-    ["arabic_english_audios/audios/
-    ["arabic_english_audios/audios/
-    ["arabic_english_audios/audios/
-    ["arabic_english_audios/audios/arabic_audio_31.mp3", 'Arabic', 'transcribe', 'word'],
-    ["arabic_english_audios/audios/arabic_audio_32.mp3", 'Arabic', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/arabic_audio_11.mp3", 'Arabic', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/arabic_audio_12.mp3", 'Arabic', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/arabic_audio_13.mp3", 'Arabic', 'transcribe', 'word'],
 
+    ["arabic_english_audios/audios/english_audio_18.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_19.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_20.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_21.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_22.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_23.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_24.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_25.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_26.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_27.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_28.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_29.mp3", 'English', 'transcribe', 'word'],
+    ["arabic_english_audios/audios/english_audio_30.mp3", 'English', 'transcribe', 'word'],
     ["arabic_english_audios/audios/english_audio_31.mp3", 'English', 'transcribe', 'word'],
     ["arabic_english_audios/audios/english_audio_32.mp3", 'English', 'transcribe', 'word'],
     ["arabic_english_audios/audios/english_audio_33.mp3", 'English', 'transcribe', 'word'],
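One caveat in the context lines here: output_buffer holds a complete WAV file, so np.frombuffer(output_buffer.read(), dtype=np.int16) decodes the RIFF header along with the samples (44 bytes for a plain PCM WAV, i.e. 22 spurious int16 values), which can show up as a click at the start of playback. A hedged alternative inside transcribe that skips the container entirely and relies on the 16-bit mono output of resample_audio above:

    import numpy as np

    samples = np.array(audio_output.get_array_of_samples(), dtype=np.int16)
    gradio_audio = (audio_output.frame_rate, samples)  # (sample_rate, data) tuple gr.Audio accepts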
@@ -147,9 +162,6 @@ examples = [
     ["arabic_english_audios/audios/english_audio_43.mp3", 'English', 'transcribe', 'word'],
     ["arabic_english_audios/audios/english_audio_44.mp3", 'English', 'transcribe', 'word'],
     ["arabic_english_audios/audios/english_audio_45.mp3", 'English', 'transcribe', 'word'],
-    ["arabic_english_audios/audios/english_audio_46.mp3", 'English', 'transcribe', 'word'],
-    ["arabic_english_audios/audios/english_audio_48.mp3", 'English', 'transcribe', 'word'],
-    ["arabic_english_audios/audios/english_audio_49.mp3", 'English', 'transcribe', 'word'],
 ]
 
 with gr.Blocks(theme=gr.themes.Default()) as demo:
@@ -164,14 +176,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
         with gr.Row():
             clear_button = gr.ClearButton(value="Clear")
             submit_button = gr.Button("Submit", variant="primary", )
-
+
         with gr.Column():
             transcript_output = gr.Text(label="Transcript")
             timestamp_output = gr.Text(label="Timestamps")
             foul_words = gr.Text(label="Foul Words")
             output_audio = gr.Audio(label="Output Audio", type="numpy")
 
-    examples = gr.Examples(examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=
+    examples = gr.Examples(examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=50, cache_examples=False)
 
     submit_button.click(fn=transcribe, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio])
     clear_button.add([audio_input, audio_language, task, timestamp_type, transcript_output, timestamp_output, foul_words, output_audio])
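cache_examples=False is the significant part of the restored gr.Examples line: with caching on, Gradio runs fn over every listed clip at startup to pre-compute outputs, which is expensive with Whisper large-v3 on CPU and dozens of examples. With it off, and run_on_click left at its default of False, clicking an example only fills the four inputs; the user still presses Submit to run transcribe. For reference, the same call laid out one argument per line:

    gr.Examples(
        examples,
        inputs=[audio_input, audio_language, task, timestamp_type],
        outputs=[transcript_output, timestamp_output, foul_words, output_audio],
        fn=transcribe,
        examples_per_page=50,
        cache_examples=False,  # avoid pre-running transcribe on every clip at launch
    )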