LiamKhoaLe committed on
Commit 243b6fb · 1 Parent(s): c45c039

Rm pydub with ffmpeg, Use Whisper

Files changed (5)
  1. app.py +42 -44
  2. requirements.txt +3 -1
  3. statics/index.html +7 -7
  4. statics/script.js +7 -12
  5. statics/styles.css +7 -1
app.py CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
-import os
-import tempfile
+import os, tempfile
 from pathlib import Path
 from typing import Dict
 
@@ -12,12 +11,11 @@ from fastapi.staticfiles import StaticFiles
 
 # AI + LLM
 import torch # For transformer
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from pydub import AudioSegment
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import numpy as np
 
 ############################################
@@ -29,8 +27,9 @@ if not GEMINI_API_KEY:
     raise RuntimeError("GEMINI_API_KEY environment variable must be set!")
 
 # Tiny Whisper model is light enough for CPU Spaces; change if GPU is available
-ASR_MODEL_ID = "openai/whisper-tiny" # ~39 MB
+ASR_MODEL_ID = "openai/whisper-small.en"
 ASR_LANGUAGE = "en" # Force to English for interview setting
+SAMPLE_RATE = 16000
 
 ############################################
 # ── FastAPI App ───────────────────────────
@@ -55,33 +54,8 @@ app.mount("/statics", StaticFiles(directory="statics"), name="statics")
 processor = None
 model = None
 
-
-@app.on_event("startup")
-async def load_models():
-    global processor, model
-    cache_path = Path("model_cache") # local writable path inside Hugging Face Space
-    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model.to("cpu")
-
-
-############################################
-# ── Helpers ───────────────────────────────
-############################################
-
-def build_prompt(question: str) -> str:
-    """Craft a prompt that elicits concise, structured answers."""
-    return (
-        "You are a helpful career‑coach AI. Answer the following interview "
-        "question clearly and concisely, offering practical insights when "
-        "appropriate.\n\n"
-        f"Interview question: \"{question}\""
-    )
-
-def memory_usage_mb() -> float:
-    return psutil.Process().memory_info().rss / 1_048_576 # bytes→MiB
-
 # Enable Logging for Debugging
+import psutil
 import logging
 # Set up app-specific logger
 logger = logging.getLogger("triage-response")
@@ -95,7 +69,6 @@ logger.addHandler(handler)
 for noisy in ["pymongo", "urllib3", "httpx", "uvicorn", "uvicorn.error", "uvicorn.access"]:
     logging.getLogger(noisy).setLevel(logging.WARNING)
 # Monitor Resources Before Startup
-import psutil
 def check_system_resources():
     memory = psutil.virtual_memory()
     cpu = psutil.cpu_percent(interval=1)
@@ -110,6 +83,37 @@ def check_system_resources():
         logger.warning("⚠️ High Disk usage detected!")
 check_system_resources()
 
+# Startup
+@app.on_event("startup")
+async def load_models():
+    global processor, model
+    cache = Path("model_cache"); cache.mkdir(exist_ok=True)
+    # in startup (Transformer Whisper processing)
+    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
+    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
+    # Force English transcription – never translate
+    forced = processor.get_decoder_prompt_ids(language="english", task="transcribe")
+    model.config.forced_decoder_ids = forced
+    model.to("cpu")
+    model.eval()
+    logger.info("[STARTUP] Whisper loaded ✔")
+
+
+############################################
+# ── Helpers ───────────────────────────────
+############################################
+
+def build_prompt(question: str) -> str:
+    """Craft a prompt that elicits concise, structured answers."""
+    return (
+        "You are a helpful career‑coach AI. Answer the following interview "
+        "question clearly and concisely, offering practical insights when appropriate.\n"
+        "Use markdown for **bold**, *italic*, and bullet‑lists when helpful.\n\n"
+        f"Interview question: \"{question}\""
+    )
+
+def memory_usage_mb() -> float:
+    return psutil.Process().memory_info().rss / 1_048_576 # bytes→MiB
 
 ############################################
 # ── Routes ────────────────────────────────
@@ -133,18 +137,12 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
        tmp_path = tmp.name
    try:
        # ── 1. Transcribe
-       # Load audio using pydub (which handles WebM/Opus/MP3/etc.)
-       audio = AudioSegment.from_file(tmp_path)
-       audio = audio.set_frame_rate(16000).set_channels(1) # Whisper expects mono 16kHz
-       samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15) # normalize int16
-       # Obtain speech and process to tensor
-       speech = samples
-       inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
-       input_features = inputs["input_features"].to("cpu")
-       generated_ids = model.generate(input_features)
-       question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-       if not question:
-           raise ValueError("Empty transcription")
+       seg = AudioSegment.from_file(tmp_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
+       audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2**15)
+       inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
+       ids = model.generate(inputs.input_features.to(model.device))
+       question = processor.decode(ids[0], skip_special_tokens=True).strip()
+       if not question: raise ValueError("Could not detect speech")
        logger.info(f"[VOICE] Detected transcribe: {question}")
        # ── 2. LLM answer
        prompt = build_prompt(question)
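
For reference, the new transcription path introduced above condenses to the short standalone sketch below. This is an illustrative reconstruction, not the exact Space code: it assumes pydub (with an ffmpeg backend) is still installed for decoding, since requirements.txt still lists it, it uses a single cache variable consistently for the model cache directory, and the input file record.wav is a placeholder for the uploaded blob. The English-forcing step is omitted because whisper-small.en is an English-only checkpoint.

# Hypothetical standalone sketch of the commit's transcription path (CPU-only, like the Space)
from pathlib import Path

import numpy as np
from pydub import AudioSegment  # assumes ffmpeg is available on PATH
from transformers import WhisperProcessor, WhisperForConditionalGeneration

ASR_MODEL_ID = "openai/whisper-small.en"
SAMPLE_RATE = 16000

cache = Path("model_cache")
cache.mkdir(exist_ok=True)
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model.to("cpu").eval()

def transcribe(path: str) -> str:
    # Decode whatever the browser recorded, then downmix to mono 16 kHz for Whisper
    seg = AudioSegment.from_file(path).set_frame_rate(SAMPLE_RATE).set_channels(1)
    audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2 ** 15)  # int16 → [-1, 1]
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    ids = model.generate(inputs.input_features.to(model.device))
    return processor.decode(ids[0], skip_special_tokens=True).strip()

if __name__ == "__main__":
    print(transcribe("record.wav"))  # "record.wav" is a placeholder clip

Keeping the resampling in pydub means any container the browser produces (WebM/Opus, WAV, MP3) can be fed straight into Whisper's expected mono 16 kHz input.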
requirements.txt CHANGED
@@ -5,13 +5,15 @@ aiofiles # Static file serving
 python-multipart # File uploads
 
 # Voice‑to‑text (Whisper via Transformers)
-transformers # For language processing
+transformers # For whisper
 torch # Just to run transformer so don't remove
 huggingface_hub
+accelerate
 
 # Audio
 pydub
 ffmpeg-python
+openai-whisper # pulls tiny‑en / small‑en
 
 # Gemini Flash 2.5
 google-genai
statics/index.html CHANGED
@@ -3,23 +3,23 @@
 <head>
   <meta charset="UTF-8" />
   <title>Interview Q&A Assistant</title>
-  <meta name="viewport" content="width=device-width, initial-scale=1" />
-  <link rel="stylesheet" href="/statics/styles.css" />
-  <link rel="icon" type="image/png" href="/statics/icon.png" />
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <link rel="stylesheet" href="/statics/styles.css"/>
+  <link rel="icon" type="image/png" href="/statics/icon.png"/>
+  <!-- markdown‑to‑html -->
+  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/purify.min.js"></script>
 </head>
 <body>
   <main class="container">
     <h1>Interview Q&amp;A Assistant</h1>
     <p class="subtitle">Hold the button, ask your interview question, release to get an answer.</p>
-
     <button id="record-button" class="record-btn">🎙 Hold&nbsp;to&nbsp;Ask</button>
-
     <section class="output-section">
       <h2>Your Question</h2>
       <pre id="question-output" class="output"></pre>
-
       <h2>AI&nbsp;Answer</h2>
-      <pre id="answer-output" class="output"></pre>
+      <pre id="answer-output" class="output markdown"></pre>
     </section>
   </main>
 
statics/script.js CHANGED
@@ -19,33 +19,29 @@ function typeEffect(el, text, speed = 30) {
 }
 
 // Audio recording setup
-let mediaRecorder = null;
-let chunks = [];
-
+let mediaRecorder, chunks = [];
+// Initialise media data
 async function initMedia() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
     mediaRecorder = new MediaRecorder(stream);
-
+    // Continuously push available data
     mediaRecorder.ondataavailable = e => chunks.push(e.data);
-
+    // Stop btn start write and send audio chunk
    mediaRecorder.onstop = async () => {
      const audioBlob = new Blob(chunks, { type: "audio/wav" });
      chunks = [];
-
      // Build form data
      const form = new FormData();
      form.append("file", audioBlob, "record.wav");
-
      // UX feedback
      typeEffect(questionEl, "⌛ Transcribing…");
      answerEl.textContent = "";
-
      try {
        const res = await fetch("/voice-transcribe", { method: "POST", body: form });
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const data = await res.json();
-
+        // render markdown after a small delay for dramatic effect
        typeEffect(questionEl, data.question || "[no speech detected]");
        setTimeout(() => typeEffect(answerEl, data.answer || "[no answer]"), 500);
      } catch (err) {
@@ -62,7 +58,6 @@ function bindRecordBtn() {
   if (!mediaRecorder) return;
   recordBtn.addEventListener("mousedown", () => mediaRecorder.start());
   recordBtn.addEventListener("mouseup", () => mediaRecorder.stop());
-
   // Touch devices
   recordBtn.addEventListener("touchstart", e => { e.preventDefault(); mediaRecorder.start(); });
   recordBtn.addEventListener("touchend", e => { e.preventDefault(); mediaRecorder.stop(); });
@@ -70,6 +65,6 @@ function bindRecordBtn() {
 
 // Init on page load
 window.addEventListener("DOMContentLoaded", async () => {
-  await initMedia();
-  bindRecordBtn();
+  try { await initMedia(); bindRecordBtn(); }
+  catch (e) { alert("Mic permission required"); }
 });
statics/styles.css CHANGED
@@ -32,4 +32,10 @@ h1 { margin-top: 0; text-align: center; color: var(--primary); }
   background: #000; color: #0f0; padding: 16px; min-height: 60px;
   border-radius: 4px; overflow-x: auto; font-family: var(--mono);
   white-space: pre-wrap; word-wrap: break-word;
-}
+}
+
+/* --- new markdown styling --- */
+.output.markdown h3, .output.markdown h2 { color:#55f; margin:6px 0; }
+.output.markdown strong { font-weight:bold; color:#fff; background:#333; padding:0 4px; border-radius:3px; }
+.output.markdown em { font-style:italic; color:#ffd700; }
+.output.markdown ul { margin:4px 0 4px 20px; }