Commit c45c039 · Parent: 77d7341
Rm resampy Use pydub with ffmpeg

Files changed:
- Dockerfile (+1 -1)
- app.py (+7 -14)
- requirements.txt (+4 -2)

Dockerfile
CHANGED
@@ -11,7 +11,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 WORKDIR /app
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y ffmpeg \
     ca-certificates curl dnsutils gcc openssl && \
     rm -rf /var/lib/apt/lists/*
 
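Note: pydub only shells out to the ffmpeg binary, so the apt-installed ffmpeg above is what actually decodes uploads. A quick sanity check, runnable inside the built container (a hedged sketch, not part of the commit):

import shutil
import subprocess

# pydub invokes the ffmpeg executable; make sure the image exposes it on PATH.
assert shutil.which("ffmpeg") is not None, "ffmpeg binary not found on PATH"

# Print the first line of `ffmpeg -version` to confirm the apt package installed correctly.
version = subprocess.run(["ffmpeg", "-version"], capture_output=True, text=True)
print(version.stdout.splitlines()[0])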
app.py
CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
 import os
-os.environ["NUMBA_DISABLE_CACHE"] = "1"
 import tempfile
 from pathlib import Path
 from typing import Dict
@@ -18,8 +17,7 @@ from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from …
-import resampy
+from pydub import AudioSegment
 import numpy as np
 
 ############################################
@@ -135,18 +133,13 @@ async def voice_transcribe(file: UploadFile = File(...)):  # noqa: B008
         tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-        # Load …
-        …
-        # …
-        …
-        data = data.astype(np.float32) / np.iinfo(data.dtype).max
-        # Resample to 16 kHz for Whisper
-        if sample_rate != 16000:
-            data = resampy.resample(data, sample_rate, 16000)
-            sample_rate = 16000
+        # Load audio using pydub (which handles WebM/Opus/MP3/etc.)
+        audio = AudioSegment.from_file(tmp_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # Whisper expects mono 16kHz
+        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)  # normalize int16
         # Obtain speech and process to tensor
-        speech = …
-        inputs = processor(speech, sampling_rate=…
+        speech = samples
+        inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
         input_features = inputs["input_features"].to("cpu")
         generated_ids = model.generate(input_features)
         question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
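For context, the new decode path can be exercised outside the endpoint as a standalone sketch. The checkpoint name openai/whisper-tiny and the file sample.webm below are placeholders; the actual model and upload format are not shown in this diff:

import numpy as np
from pydub import AudioSegment
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Placeholder checkpoint; the app's actual Whisper model is defined elsewhere in app.py.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# pydub + ffmpeg decode WebM/Opus/MP3/etc. without resampy.
audio = AudioSegment.from_file("sample.webm")          # hypothetical input file
audio = audio.set_frame_rate(16000).set_channels(1)    # Whisper expects mono 16 kHz
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2 ** 15)  # 16-bit PCM -> [-1, 1]

inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"])
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())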
requirements.txt
CHANGED
@@ -8,8 +8,10 @@ python-multipart # File uploads
 transformers # For language processing
 torch # Just to run transformer so don't remove
 huggingface_hub
-…
-…
+
+# Audio
+pydub
+ffmpeg-python
 
 # Gemini Flash 2.5
 google-genai
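Worth noting: pydub itself only needs the ffmpeg binary installed in the Dockerfile; ffmpeg-python is a separate wrapper library and is not required by pydub. If the binary ever ends up off PATH, pydub can be pointed at it explicitly (a sketch, assuming the usual /usr/bin install location):

from pydub import AudioSegment
from pydub.utils import which

# Resolve the ffmpeg executable that the apt package installed.
ffmpeg_path = which("ffmpeg")
print("pydub will use:", ffmpeg_path)   # e.g. /usr/bin/ffmpeg

# Optional: pin the converter explicitly instead of relying on PATH lookup.
AudioSegment.converter = ffmpeg_path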