Commit 243b6fb · 1 Parent(s): c45c039
Rm pydub with ffmpeg, Use Whisper

Files changed:
- app.py +42 -44
- requirements.txt +3 -1
- statics/index.html +7 -7
- statics/script.js +7 -12
- statics/styles.css +7 -1

app.py
CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
-import os
-import tempfile
+import os, tempfile
 from pathlib import Path
 from typing import Dict
 
@@ -12,12 +11,11 @@ from fastapi.staticfiles import StaticFiles
 
 # AI + LLM
 import torch # For transformer
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import numpy as np
 
 ############################################
@@ -29,8 +27,9 @@ if not GEMINI_API_KEY:
     raise RuntimeError("GEMINI_API_KEY environment variable must be set!")
 
 # Tiny Whisper model is light enough for CPU Spaces; change if GPU is available
-ASR_MODEL_ID = "openai/whisper-
+ASR_MODEL_ID = "openai/whisper-small.en"
 ASR_LANGUAGE = "en" # Force to English for interview setting
+SAMPLE_RATE = 16000
 
 ############################################
 # ── FastAPI App ───────────────────────────
@@ -55,33 +54,8 @@ app.mount("/statics", StaticFiles(directory="statics"), name="statics")
 processor = None
 model = None
 
-
-@app.on_event("startup")
-async def load_models():
-    global processor, model
-    cache_path = Path("model_cache")  # local writable path inside Hugging Face Space
-    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model.to("cpu")
-
-
-############################################
-# ── Helpers ───────────────────────────────
-############################################
-
-def build_prompt(question: str) -> str:
-    """Craft a prompt that elicits concise, structured answers."""
-    return (
-        "You are a helpful career‑coach AI. Answer the following interview "
-        "question clearly and concisely, offering practical insights when "
-        "appropriate.\n\n"
-        f"Interview question: \"{question}\""
-    )
-
-def memory_usage_mb() -> float:
-    return psutil.Process().memory_info().rss / 1_048_576  # bytes→MiB
-
 # Enable Logging for Debugging
+import psutil
 import logging
 # Set up app-specific logger
 logger = logging.getLogger("triage-response")
@@ -95,7 +69,6 @@ logger.addHandler(handler)
 for noisy in ["pymongo", "urllib3", "httpx", "uvicorn", "uvicorn.error", "uvicorn.access"]:
     logging.getLogger(noisy).setLevel(logging.WARNING)
 # Monitor Resources Before Startup
-import psutil
 def check_system_resources():
     memory = psutil.virtual_memory()
     cpu = psutil.cpu_percent(interval=1)
@@ -110,6 +83,37 @@ def check_system_resources():
         logger.warning("⚠️ High Disk usage detected!")
 check_system_resources()
 
+# Startup
+@app.on_event("startup")
+async def load_models():
+    global processor, model
+    cache = Path("model_cache"); cache.mkdir(exist_ok=True)
+    # in startup (Transformer Whisper processing)
+    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
+    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
+    # Force English transcription – never translate
+    forced = processor.get_decoder_prompt_ids(language="english", task="transcribe")
+    model.config.forced_decoder_ids = forced
+    model.to("cpu")
+    model.eval()
+    logger.info("[STARTUP] Whisper loaded ✔")
+
+
+############################################
+# ── Helpers ───────────────────────────────
+############################################
+
+def build_prompt(question: str) -> str:
+    """Craft a prompt that elicits concise, structured answers."""
+    return (
+        "You are a helpful career‑coach AI. Answer the following interview "
+        "question clearly and concisely, offering practical insights when appropriate.\n"
+        "Use markdown for **bold**, *italic*, and bullet‑lists when helpful.\n\n"
+        f"Interview question: \"{question}\""
+    )
+
+def memory_usage_mb() -> float:
+    return psutil.Process().memory_info().rss / 1_048_576  # bytes→MiB
 
 ############################################
 # ── Routes ────────────────────────────────
@@ -133,18 +137,12 @@ async def voice_transcribe(file: UploadFile = File(...)):  # noqa: B008
     tmp_path = tmp.name
     try:
         # ── 1. Transcribe
-
-        audio =
-
-
-
-
-        inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
-        input_features = inputs["input_features"].to("cpu")
-        generated_ids = model.generate(input_features)
-        question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        if not question:
-            raise ValueError("Empty transcription")
+        seg = AudioSegment.from_file(tmp_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
+        audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2**15)
+        inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
+        ids = model.generate(inputs.input_features.to(model.device))
+        question = processor.decode(ids[0], skip_special_tokens=True).strip()
+        if not question: raise ValueError("Could not detect speech")
         logger.info(f"[VOICE] Detected transcribe: {question}")
         # ── 2. LLM answer
         prompt = build_prompt(question)
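
For reference, the transcription path this commit wires into app.py (pydub/ffmpeg decode, WhisperProcessor feature extraction, generate, decode) can be exercised outside FastAPI. The following is a minimal standalone sketch, not part of the commit: it assumes pydub/ffmpeg, transformers and torch are installed, uses a hypothetical local `sample.wav`, and skips the forced English decoder ids since `whisper-small.en` is already an English-only checkpoint.

```python
# Standalone sketch of the new transcription path (assumptions noted above).
from pathlib import Path

import numpy as np
import torch
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration

ASR_MODEL_ID = "openai/whisper-small.en"   # same checkpoint as app.py
SAMPLE_RATE = 16000                        # Whisper expects 16 kHz mono

cache = Path("model_cache"); cache.mkdir(exist_ok=True)
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model.to("cpu").eval()

# Decode the recording via ffmpeg, resample to 16 kHz mono ("sample.wav" is a placeholder)
seg = AudioSegment.from_file("sample.wav").set_frame_rate(SAMPLE_RATE).set_channels(1)
# Convert PCM integers to the float32 waveform the processor expects
audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (1 << (8 * seg.sample_width - 1))

inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
with torch.no_grad():
    ids = model.generate(inputs.input_features.to(model.device))
print(processor.decode(ids[0], skip_special_tokens=True).strip())
```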

requirements.txt
CHANGED
@@ -5,13 +5,15 @@ aiofiles # Static file serving
 python-multipart # File uploads
 
 # Voice‑to‑text (Whisper via Transformers)
-transformers # For
+transformers # For whisper
 torch # Just to run transformer so don't remove
 huggingface_hub
+accelerate
 
 # Audio
 pydub
 ffmpeg-python
+openai-whisper # pulls tiny‑en / small‑en
 
 # Gemini Flash 2.5
 google-genai
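
Since the Space now pulls `openai/whisper-small.en` at startup, one option (not part of this commit) is to warm the `model_cache` directory at build time with `huggingface_hub`, which is already listed above. A minimal sketch, assuming only that the build step can run Python:

```python
# Optional pre-fetch sketch (not part of this commit): populate model_cache
# ahead of time so the startup hook in app.py finds a local copy.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="openai/whisper-small.en",  # checkpoint referenced by app.py
    cache_dir="model_cache",            # same cache directory app.py uses
)
```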

statics/index.html
CHANGED
@@ -3,23 +3,23 @@
 <head>
   <meta charset="UTF-8" />
   <title>Interview Q&A Assistant</title>
-  <meta name="viewport" content="width=device-width, initial-scale=1"
-  <link rel="stylesheet" href="/statics/styles.css"
-  <link rel="icon" type="image/png" href="/statics/icon.png"
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <link rel="stylesheet" href="/statics/styles.css"/>
+  <link rel="icon" type="image/png" href="/statics/icon.png"/>
+  <!-- markdown‑to‑html -->
+  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/purify.min.js"></script>
 </head>
 <body>
   <main class="container">
     <h1>Interview Q&A Assistant</h1>
     <p class="subtitle">Hold the button, ask your interview question, release to get an answer.</p>
-
     <button id="record-button" class="record-btn">🎙 Hold to Ask</button>
-
     <section class="output-section">
       <h2>Your Question</h2>
       <pre id="question-output" class="output"></pre>
-
       <h2>AI Answer</h2>
-      <pre id="answer-output" class="output"></pre>
+      <pre id="answer-output" class="output markdown"></pre>
     </section>
   </main>
 

statics/script.js
CHANGED
@@ -19,33 +19,29 @@ function typeEffect(el, text, speed = 30) {
 }
 
 // Audio recording setup
-let mediaRecorder =
-
-
+let mediaRecorder, chunks = [];
+// Initialise media data
 async function initMedia() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
     mediaRecorder = new MediaRecorder(stream);
-
+    // Continuously push available data
     mediaRecorder.ondataavailable = e => chunks.push(e.data);
-
+    // Stop btn start write and send audio chunk
     mediaRecorder.onstop = async () => {
       const audioBlob = new Blob(chunks, { type: "audio/wav" });
       chunks = [];
-
      // Build form data
      const form = new FormData();
      form.append("file", audioBlob, "record.wav");
-
      // UX feedback
      typeEffect(questionEl, "⌛ Transcribing…");
      answerEl.textContent = "";
-
      try {
        const res = await fetch("/voice-transcribe", { method: "POST", body: form });
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const data = await res.json();
-
+        // render markdown after a small delay for dramatic effect
        typeEffect(questionEl, data.question || "[no speech detected]");
        setTimeout(() => typeEffect(answerEl, data.answer || "[no answer]"), 500);
      } catch (err) {
@@ -62,7 +58,6 @@ function bindRecordBtn() {
   if (!mediaRecorder) return;
   recordBtn.addEventListener("mousedown", () => mediaRecorder.start());
   recordBtn.addEventListener("mouseup", () => mediaRecorder.stop());
-
   // Touch devices
   recordBtn.addEventListener("touchstart", e => { e.preventDefault(); mediaRecorder.start(); });
   recordBtn.addEventListener("touchend", e => { e.preventDefault(); mediaRecorder.stop(); });
@@ -70,6 +65,6 @@ function bindRecordBtn() {
 
 // Init on page load
 window.addEventListener("DOMContentLoaded", async () => {
-  await initMedia();
-
+  try { await initMedia(); bindRecordBtn(); }
+  catch (e) { alert("Mic permission required"); }
 });
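
The browser flow above posts the recording as multipart field "file" to /voice-transcribe and reads `question` and `answer` from the JSON response. A hedged sketch for exercising the same endpoint from Python instead of the browser: the field name and response keys come from the code above, while the base URL, the local `recording.wav` path, and the use of the `requests` library are assumptions.

```python
# Manual test of the /voice-transcribe route outside the browser.
# Assumes the app is reachable at BASE_URL, a recording.wav exists locally,
# and `requests` is installed; none of this is part of the commit.
import requests

BASE_URL = "https://binkhoale1812-interview-ai.hf.space"  # from the app.py header comment

with open("recording.wav", "rb") as f:
    # Same multipart field name ("file") that script.js appends to FormData
    resp = requests.post(f"{BASE_URL}/voice-transcribe",
                         files={"file": ("record.wav", f, "audio/wav")})
resp.raise_for_status()
data = resp.json()
print("Q:", data.get("question"))
print("A:", data.get("answer"))
```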

statics/styles.css
CHANGED
@@ -32,4 +32,10 @@ h1 { margin-top: 0; text-align: center; color: var(--primary); }
   background: #000; color: #0f0; padding: 16px; min-height: 60px;
   border-radius: 4px; overflow-x: auto; font-family: var(--mono);
   white-space: pre-wrap; word-wrap: break-word;
-}
+}
+
+/* --- new markdown styling --- */
+.output.markdown h3, .output.markdown h2 { color:#55f; margin:6px 0; }
+.output.markdown strong { font-weight:bold; color:#fff; background:#333; padding:0 4px; border-radius:3px; }
+.output.markdown em { font-style:italic; color:#ffd700; }
+.output.markdown ul { margin:4px 0 4px 20px; }
|