LiamKhoaLe committed
Commit c45c039 · 1 Parent(s): 77d7341

Rm resampy; use pydub with ffmpeg

Files changed (3)
  1. Dockerfile +1 -1
  2. app.py +7 -14
  3. requirements.txt +4 -2
Dockerfile CHANGED
@@ -11,7 +11,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y ffmpeg \
     ca-certificates curl dnsutils gcc openssl && \
     rm -rf /var/lib/apt/lists/*
 
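The apt-get change exists so that pydub can shell out to an ffmpeg binary at container runtime (pydub ships no decoder of its own). Below is a minimal sanity check, not part of the commit, that could be run at container startup to confirm the binary is actually on PATH; it uses pydub's own which() helper, which mirrors the lookup pydub performs when it picks its converter.

# Sketch only (not part of the commit): verify the apt-installed ffmpeg binary
# is visible to pydub inside the container.
from pydub.utils import which

ffmpeg_path = which("ffmpeg")  # lookup similar to the one pydub uses to find its converter
if ffmpeg_path is None:
    raise RuntimeError("ffmpeg not found on PATH; pydub cannot decode WebM/Opus/MP3 uploads")
print(f"pydub will decode via {ffmpeg_path}")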
app.py CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
 import os
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
 import tempfile
 from pathlib import Path
 from typing import Dict
@@ -18,8 +17,7 @@ from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from scipy.io import wavfile
-import resampy
+from pydub import AudioSegment
 import numpy as np
 
 ############################################
@@ -135,18 +133,13 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
         tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-        # Load WAV
-        sample_rate, data = wavfile.read(tmp_path)
-        # Convert to float32 if needed
-        if data.dtype != np.float32:
-            data = data.astype(np.float32) / np.iinfo(data.dtype).max
-        # Resample to 16 kHz for Whisper
-        if sample_rate != 16000:
-            data = resampy.resample(data, sample_rate, 16000)
-            sample_rate = 16000
+        # Load audio using pydub (which handles WebM/Opus/MP3/etc.)
+        audio = AudioSegment.from_file(tmp_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # Whisper expects mono 16kHz
+        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)  # normalize int16
         # Obtain speech and process to tensor
-        speech = data
-        inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
+        speech = samples
+        inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
         input_features = inputs["input_features"].to("cpu")
         generated_ids = model.generate(input_features)
         question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
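Pieced together outside the FastAPI handler, the new transcription path is: decode whatever the browser uploads with pydub/ffmpeg, downmix to mono 16 kHz, scale the int16 samples to [-1, 1] floats, then run Whisper. The following is a self-contained sketch of that flow; "openai/whisper-tiny" and "question.webm" are placeholders, since this diff does not show which checkpoint app.py builds `processor` and `model` from.

# Standalone sketch of the pydub → Whisper flow introduced by this commit.
# "openai/whisper-tiny" and "question.webm" are assumed placeholders, not values from app.py.
import numpy as np
from pydub import AudioSegment
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

audio = AudioSegment.from_file("question.webm")       # ffmpeg decodes WebM/Opus/MP3/...
audio = audio.set_frame_rate(16000).set_channels(1)   # Whisper expects mono 16 kHz
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2 ** 15)  # int16 -> [-1, 1]

inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"])
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())

The `/ (2 ** 15)` scaling assumes 16-bit samples, which matches the commit's own normalization; if other sample widths can appear, `audio.sample_width` can be checked or forced with `set_sample_width(2)` first.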
requirements.txt CHANGED
@@ -8,8 +8,10 @@ python-multipart # File uploads
 transformers # For language processing
 torch # Just to run transformer so don't remove
 huggingface_hub
-scipy
-resampy
+
+# Audio
+pydub
+ffmpeg-python
 
 # Gemini Flash 2.5
 google-genai
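Of the two new requirements, only pydub is imported by app.py in this diff; ffmpeg-python is a separate Python wrapper around the same apt-installed binaries. If it is used elsewhere in the project, a typical call would be probing an upload before decoding it, as in the sketch below (the file name is hypothetical).

# Sketch only: ffmpeg-python is not imported anywhere in this diff, so this just
# illustrates the kind of call the dependency provides. "question.webm" is hypothetical.
import ffmpeg  # import name for the ffmpeg-python package

info = ffmpeg.probe("question.webm")          # wraps ffprobe from the apt ffmpeg install
duration = float(info["format"]["duration"])  # container-level metadata
print(f"upload duration: {duration:.2f}s")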