Spaces:

Ritesh-hf
/

speech-to-text-with-timestamps

Sleeping

Ritesh-hf committed on Jul 5, 2024

Commit

d6c2014

verified ·

1 Parent(s): 0d3b49f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ import re
 import scipy.io.wavfile
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-model_id = "openai/whisper-tiny"
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id, low_cpu_mem_usage=True, use_safetensors=True
@@ -31,7 +31,7 @@ pipe = pipeline(
     max_new_tokens=128,
     chunk_length_s=30,
     batch_size=8,
-    # device=device,
 )
@@ -39,24 +39,24 @@ arabic_bad_Words = pd.read_csv("arabic_bad_words_dataset.csv")
 english_bad_Words = pd.read_csv("english_bad_words_dataset.csv")
-def load_audio(file: str, sr: int = 16000):
-    try:
-        # This reads the audio from the video file without creating a separate audio file
-        command = [
-            "ffmpeg",
-            "-i", file,
-            "-f", "s16le",
-            "-acodec", "pcm_s16le",
-            "-ar", str(sr),
-            "-ac", "1",
-            "-"
-        ]
-        out = subprocess.run(command, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 def clean_english_word(word):
     cleaned_text = re.sub(r'^[\s\W_]+|[\s\W_]+$', '', word)

 import scipy.io.wavfile
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model_id = "openai/whisper-large-v3"
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id, low_cpu_mem_usage=True, use_safetensors=True
     max_new_tokens=128,
     chunk_length_s=30,
     batch_size=8,
+    device=device,
 )
 english_bad_Words = pd.read_csv("english_bad_words_dataset.csv")
+# def load_audio(file: str, sr: int = 16000):
+#     try:
+#         # This reads the audio from the video file without creating a separate audio file
+#         command = [
+#             "ffmpeg",
+#             "-i", file,
+#             "-f", "s16le",
+#             "-acodec", "pcm_s16le",
+#             "-ar", str(sr),
+#             "-ac", "1",
+#             "-"
+#         ]
+#         out = subprocess.run(command, capture_output=True, check=True).stdout
+#     except subprocess.CalledProcessError as e:
+#         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+#     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 def clean_english_word(word):
     cleaned_text = re.sub(r'^[\s\W_]+|[\s\W_]+$', '', word)