LiamKhoaLe committed
Commit c45c039 · 1 Parent(s): 77d7341

Rm resampy; use pydub with ffmpeg

Files changed (3)
  1. Dockerfile +1 -1
  2. app.py +7 -14
  3. requirements.txt +4 -2
Dockerfile CHANGED
@@ -11,7 +11,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y ffmpeg \
     ca-certificates curl dnsutils gcc openssl && \
     rm -rf /var/lib/apt/lists/*
 
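The apt-get change exists so that pydub can shell out to an ffmpeg binary at container runtime (pydub ships no decoder of its own). Below is a minimal sanity check, not part of the commit, that could be run at container startup to confirm the binary is actually on PATH; it uses pydub's own which() helper, which mirrors the lookup pydub performs when it picks its converter.

# Sketch only (not part of the commit): verify the apt-installed ffmpeg binary
# is visible to pydub inside the container.
from pydub.utils import which

ffmpeg_path = which("ffmpeg")  # lookup similar to the one pydub uses to find its converter
if ffmpeg_path is None:
    raise RuntimeError("ffmpeg not found on PATH; pydub cannot decode WebM/Opus/MP3 uploads")
print(f"pydub will decode via {ffmpeg_path}")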
app.py CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
 import os
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
 import tempfile
 from pathlib import Path
 from typing import Dict
@@ -18,8 +17,7 @@ from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from scipy.io import wavfile
-import resampy
+from pydub import AudioSegment
 import numpy as np
 
 ############################################
@@ -135,18 +133,13 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
         tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-        # Load WAV
-        sample_rate, data = wavfile.read(tmp_path)
-        # Convert to float32 if needed
-        if data.dtype != np.float32:
-            data = data.astype(np.float32) / np.iinfo(data.dtype).max
-        # Resample to 16 kHz for Whisper
-        if sample_rate != 16000:
-            data = resampy.resample(data, sample_rate, 16000)
-            sample_rate = 16000
+        # Load audio using pydub (which handles WebM/Opus/MP3/etc.)
+        audio = AudioSegment.from_file(tmp_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # Whisper expects mono 16kHz
+        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)  # normalize int16
         # Obtain speech and process to tensor
-        speech = data
-        inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
+        speech = samples
+        inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
         input_features = inputs["input_features"].to("cpu")
         generated_ids = model.generate(input_features)
         question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
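Pieced together outside the FastAPI handler, the new transcription path is: decode whatever the browser uploads with pydub/ffmpeg, downmix to mono 16 kHz, scale the int16 samples to [-1, 1] floats, then run Whisper. The following is a self-contained sketch of that flow; "openai/whisper-tiny" and "question.webm" are placeholders, since this diff does not show which checkpoint app.py builds `processor` and `model` from.

# Standalone sketch of the pydub → Whisper flow introduced by this commit.
# "openai/whisper-tiny" and "question.webm" are assumed placeholders, not values from app.py.
import numpy as np
from pydub import AudioSegment
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

audio = AudioSegment.from_file("question.webm")       # ffmpeg decodes WebM/Opus/MP3/...
audio = audio.set_frame_rate(16000).set_channels(1)   # Whisper expects mono 16 kHz
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2 ** 15)  # int16 -> [-1, 1]

inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"])
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())

The `/ (2 ** 15)` scaling assumes 16-bit samples, which matches the commit's own normalization; if other sample widths can appear, `audio.sample_width` can be checked or forced with `set_sample_width(2)` first.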
requirements.txt CHANGED
@@ -8,8 +8,10 @@ python-multipart # File uploads
 transformers # For language processing
 torch # Just to run transformer so don't remove
 huggingface_hub
-scipy
-resampy
+
+# Audio
+pydub
+ffmpeg-python
 
 # Gemini Flash 2.5
 google-genai
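Of the two new requirements, only pydub is imported by app.py in this diff; ffmpeg-python is a separate Python wrapper around the same apt-installed binaries. If it is used elsewhere in the project, a typical call would be probing an upload before decoding it, as in the sketch below (the file name is hypothetical).

# Sketch only: ffmpeg-python is not imported anywhere in this diff, so this just
# illustrates the kind of call the dependency provides. "question.webm" is hypothetical.
import ffmpeg  # import name for the ffmpeg-python package

info = ffmpeg.probe("question.webm")          # wraps ffprobe from the apt ffmpeg install
duration = float(info["format"]["duration"])  # container-level metadata
print(f"upload duration: {duration:.2f}s")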