Ritesh-hf committed
Commit d11ac3c · verified · 1 Parent(s): 6f07ffc

Update app.py

Files changed (1)
  1. app.py +154 -43
app.py CHANGED
@@ -1,20 +1,25 @@
  import gradio as gr
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
- import numpy as np
- import pandas as pd
- import re
  from pydub import AudioSegment
  from pydub.generators import Sine
  import io
- # import torch
+ import ffmpeg
+ import subprocess
+ import torch
+ from moviepy.editor import VideoFileClip, AudioFileClip, CompositeVideoClip
+ import tempfile
+ import numpy as np
+ import pandas as pd
+ import re
+ import scipy.io.wavfile

- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
- model_id = "openai/whisper-large-v3"
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model_id = "openai/whisper-tiny"

  model = AutoModelForSpeechSeq2Seq.from_pretrained(
      model_id, low_cpu_mem_usage=True, use_safetensors=True
  )
- # model.to(device)
+ model.to(device)

  processor = AutoProcessor.from_pretrained(model_id)
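
Note: this hunk also swaps the checkpoint from openai/whisper-large-v3 to openai/whisper-tiny, while the demo heading in the last hunk still says whisper-large-v3. The added `import ffmpeg` appears unused in the changed lines, since the new `load_audio` helper shells out to the ffmpeg binary via `subprocess` instead. The `pipe` object called later is built in an unchanged part of app.py; a minimal sketch of how it is presumably wired to the objects above (the exact arguments are an assumption, not shown in this diff):

    # Sketch only: `pipe` is defined outside this diff; a typical
    # construction from the objects above would be:
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=device,
    )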
@@ -34,6 +39,25 @@ arabic_bad_Words = pd.read_csv("arabic_bad_words_dataset.csv")
  english_bad_Words = pd.read_csv("english_bad_words_dataset.csv")


+ def load_audio(file: str, sr: int = 16000):
+     try:
+         # This reads the audio from the video file without creating a separate audio file
+         command = [
+             "ffmpeg",
+             "-i", file,
+             "-f", "s16le",
+             "-acodec", "pcm_s16le",
+             "-ar", str(sr),
+             "-ac", "1",
+             "-"
+         ]
+
+         out = subprocess.run(command, capture_output=True, check=True).stdout
+     except subprocess.CalledProcessError as e:
+         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
  def clean_english_word(word):
      cleaned_text = re.sub(r'^[\s\W_]+|[\s\W_]+$', '', word)
      return cleaned_text.lower()
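
`load_audio` mirrors Whisper's reference loader: it decodes any ffmpeg-readable file to mono 16 kHz float32 samples in memory. None of the changed lines call it yet; `transcribe_video` below extracts audio through moviepy instead. If it were wired in, a hypothetical call could look like this (file name invented for illustration):

    # Hypothetical usage: the ASR pipeline accepts raw samples plus a sampling rate.
    samples = load_audio("some_clip.mp4", sr=16000)
    result = pipe({"raw": samples, "sampling_rate": 16000},
                  return_timestamps="word", generate_kwargs={"task": "transcribe"})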
@@ -95,7 +119,7 @@ def format_output_to_list(data):
      formatted_list = "\n".join([f"{item['timestamp'][0]}s - {item['timestamp'][1]}s \t : {item['text']}" for item in data])
      return formatted_list

- def transcribe(input_audio, audio_language, task, timestamp_type):
+ def transcribe_audio(input_audio, audio_language, task, timestamp_type):
      if input_audio is None:
          raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
@@ -129,31 +153,94 @@

      return [text, timestamps, foul_words, (sample_rate, audio_data)]

- cache_examples = [
-     ["arabic_english_audios/audios/english_audio_18.mp3", 'English', 'transcribe', 'word'],
-     ["arabic_english_audios/audios/english_audio_20.mp3", 'English', 'transcribe', 'word'],
-     ["arabic_english_audios/audios/english_audio_21.mp3", 'English', 'transcribe', 'word'],
-     ["arabic_english_audios/audios/english_audio_22.mp3", 'English', 'transcribe', 'word'],
-     ["arabic_english_audios/audios/english_audio_27.mp3", 'English', 'transcribe', 'word'],
-     ["arabic_english_audios/audios/english_audio_29.mp3", 'English', 'transcribe', 'word'],
- ]
+
+ def transcribe_video(input_video, video_language, task, timestamp_type):
+     # Load the video file
+     video = VideoFileClip(input_video)
+
+     # Extract the audio
+     audio = video.audio
+
+     # Create a temporary file to save the audio
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+         audio.write_audiofile(temp_audio_file.name, codec='pcm_s16le')
+
+     # Load the audio file into an AudioSegment
+     audio_segment = AudioSegment.from_file(temp_audio_file.name, format="wav")
+
+     # Ensure the audio is mono
+     if audio_segment.channels > 1:
+         audio_segment = audio_segment.set_channels(1)
+
+     # Save the mono audio to a temporary file
+     mono_temp_audio_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+     audio_segment.export(mono_temp_audio_file.name, format="wav")
+
+     # Save the mono audio to a file
+     extracted_audio_path = "extracted_audio_mono.mp3"
+     audio_segment.export(extracted_audio_path, format="mp3")
+
+     # Load the audio as a numpy array
+     # sample_rate, audio_array = scipy.io.wavfile.read(mono_temp_audio_file.name)
+
+     output = pipe(extracted_audio_path, return_timestamps=timestamp_type, generate_kwargs={"task": task})
+     text = output['text']
+
+     timestamps = format_output_to_list(output['chunks'])
+
+     foul_words, negative_timestamps = classifier(output['chunks'], video_language)
+     foul_words = ", ".join(foul_words)
+
+
+     audio_output = mute_audio_range(extracted_audio_path, negative_timestamps)
+
+     # Resample the output audio to 16kHz
+     audio_output = resample_audio(audio_output, 16000)
+
+     # Save the output audio to a BytesIO object
+     output_buffer = io.BytesIO()
+     audio_output.export(output_buffer, format="wav")
+     output_buffer.seek(0)
+
+     # Read the audio data from the BytesIO buffer
+     sample_rate = audio_output.frame_rate
+     audio_data = np.frombuffer(output_buffer.read(), dtype=np.int16)
+
+     # Save the processed NumPy array to a WAV file
+     processed_audio_path = "processed_audio.wav"
+     scipy.io.wavfile.write(processed_audio_path, sample_rate, audio_data)
+
+     # Load the processed audio into a moviepy AudioFileClip
+     processed_audio = AudioFileClip(processed_audio_path)
+
+     # Set the audio of the video to the processed audio
+     final_video = video.set_audio(processed_audio)
+
+     # Save the final video with the combined audio
+     final_video_path = "final_video_with_processed_audio.mp4"
+     final_video.write_videofile(final_video_path, codec="libx264", audio_codec="aac")
+
+
+     return [text, timestamps, foul_words, final_video_path]
+
+
  examples = [
      ["arabic_english_audios/audios/arabic_audio_11.mp3", 'Arabic', 'transcribe', 'word'],
      ["arabic_english_audios/audios/arabic_audio_12.mp3", 'Arabic', 'transcribe', 'word'],
      ["arabic_english_audios/audios/arabic_audio_13.mp3", 'Arabic', 'transcribe', 'word'],

-     # ["arabic_english_audios/audios/english_audio_18.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_18.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_19.mp3", 'English', 'transcribe', 'word'],
-     # ["arabic_english_audios/audios/english_audio_20.mp3", 'English', 'transcribe', 'word'],
-     # ["arabic_english_audios/audios/english_audio_21.mp3", 'English', 'transcribe', 'word'],
-     # ["arabic_english_audios/audios/english_audio_22.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_20.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_21.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_22.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_23.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_24.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_25.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_26.mp3", 'English', 'transcribe', 'word'],
-     # ["arabic_english_audios/audios/english_audio_27.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_27.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_28.mp3", 'English', 'transcribe', 'word'],
-     # ["arabic_english_audios/audios/english_audio_29.mp3", 'English', 'transcribe', 'word'],
+     ["arabic_english_audios/audios/english_audio_29.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_30.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_31.mp3", 'English', 'transcribe', 'word'],
      ["arabic_english_audios/audios/english_audio_32.mp3", 'English', 'transcribe', 'word'],
@@ -175,27 +262,51 @@ examples = [
  with gr.Blocks(theme=gr.themes.Default()) as demo:
      gr.HTML("<h2 style='text-align: center;'>Transcribing Audio with Timestamps using whisper-large-v3</h2>")
      # gr.Markdown("")
-     with gr.Row():
-         with gr.Column():
-             audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio file")
-             audio_language = gr.Radio(["Arabic", "English"], label="Audio Language")
-             task = gr.Radio(["transcribe", "translate"], label="Task")
-             timestamp_type = gr.Radio(["sentence", "word"], label="Timestamp Type")
-             with gr.Row():
-                 clear_button = gr.ClearButton(value="Clear")
-                 submit_button = gr.Button("Submit", variant="primary", )
-
-         with gr.Column():
-             transcript_output = gr.Text(label="Transcript")
-             timestamp_output = gr.Text(label="Timestamps")
-             foul_words = gr.Text(label="Foul Words")
-             output_audio = gr.Audio(label="Output Audio", type="numpy")
-
-     cache_examples = gr.Examples(cache_examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=10, cache_examples=True, label="Sample Examples")
-     non_cache_examples = gr.Examples(examples, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio], fn=transcribe, examples_per_page=50, cache_examples=False, label="Other Examples")
-
-     submit_button.click(fn=transcribe, inputs=[audio_input, audio_language, task, timestamp_type], outputs=[transcript_output, timestamp_output, foul_words, output_audio])
-     clear_button.add([audio_input, audio_language, task, timestamp_type, transcript_output, timestamp_output, foul_words, output_audio])
+     with gr.Tab("Audio"):
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio file")
+                 audio_language = gr.Radio(["Arabic", "English"], label="Audio Language")
+                 audio_task = gr.Radio(["transcribe", "translate"], label="Task")
+                 audio_timestamp_type = gr.Radio(["sentence", "word"], label="Timestamp Type")
+                 with gr.Row():
+                     audio_clear_button = gr.ClearButton(value="Clear")
+                     audio_submit_button = gr.Button("Submit", variant="primary", )
+
+             with gr.Column():
+                 audio_transcript_output = gr.Text(label="Transcript")
+                 audio_timestamp_output = gr.Text(label="Timestamps")
+                 audio_foul_words = gr.Text(label="Foul Words")
+                 output_audio = gr.Audio(label="Output Audio", type="numpy")
+
+         examples = gr.Examples(examples, inputs=[audio_input, audio_language, audio_task, audio_timestamp_type], outputs=[audio_transcript_output, audio_timestamp_output, audio_foul_words, output_audio], fn=transcribe_audio, examples_per_page=50, cache_examples=False)
+
+         audio_submit_button.click(fn=transcribe_audio, inputs=[audio_input, audio_language, audio_task, audio_timestamp_type], outputs=[audio_transcript_output, audio_timestamp_output, audio_foul_words, output_audio])
+         audio_clear_button.add([audio_input, audio_language, audio_task, audio_timestamp_type, audio_transcript_output, audio_timestamp_output, audio_foul_words, output_audio])
+
+
+     with gr.Tab("Video"):
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video(sources=["upload", 'webcam'], label="Video file")
+                 video_language = gr.Radio(["Arabic", "English"], label="Video Language")
+                 video_task = gr.Radio(["transcribe", "translate"], label="Task")
+                 video_timestamp_type = gr.Radio(["sentence", "word"], label="Timestamp Type")
+                 with gr.Row():
+                     video_clear_button = gr.ClearButton(value="Clear")
+                     video_submit_button = gr.Button("Submit", variant="primary", )
+
+             with gr.Column():
+                 video_transcript_output = gr.Text(label="Transcript")
+                 video_timestamp_output = gr.Text(label="Timestamps")
+                 video_foul_words = gr.Text(label="Foul Words")
+                 output_video = gr.Video(label="Output Video")
+                 # output_video = gr.Audio(label="Output Audio", type="numpy")
+
+
+         video_submit_button.click(fn=transcribe_video, inputs=[video_input, video_language, video_task, video_timestamp_type], outputs=[video_transcript_output, video_timestamp_output, video_foul_words, output_video])
+         video_clear_button.add([video_input, video_language, video_task, video_timestamp_type, video_transcript_output, video_timestamp_output, video_foul_words, output_video])
+


  if __name__ == "__main__":
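
Two small readability points in the new UI code: `examples = gr.Examples(examples, ...)` rebinds the module-level `examples` list to the component object, and the heading still hard-codes whisper-large-v3 although `model_id` is now whisper-tiny. A possible tidy-up (names are suggestions, not part of the commit):

    # Suggested tweaks: avoid shadowing the `examples` list and derive the
    # heading from model_id so it tracks the configured checkpoint.
    audio_examples = gr.Examples(
        examples,
        inputs=[audio_input, audio_language, audio_task, audio_timestamp_type],
        outputs=[audio_transcript_output, audio_timestamp_output, audio_foul_words, output_audio],
        fn=transcribe_audio,
        examples_per_page=50,
        cache_examples=False,
    )
    gr.HTML(f"<h2 style='text-align: center;'>Transcribing Audio with Timestamps using {model_id}</h2>")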
 