LiamKhoaLe committed on
Commit 243b6fb · 1 Parent(s): c45c039

Rm pydub with ffmpeg, Use Whisper

Files changed (5)
  1. app.py +42 -44
  2. requirements.txt +3 -1
  3. statics/index.html +7 -7
  4. statics/script.js +7 -12
  5. statics/styles.css +7 -1
app.py CHANGED
@@ -1,6 +1,5 @@
 # Access site: https://binkhoale1812-interview-ai.hf.space/
-import os
-import tempfile
+import os, tempfile
 from pathlib import Path
 from typing import Dict
 
@@ -12,12 +11,11 @@ from fastapi.staticfiles import StaticFiles
 
 # AI + LLM
 import torch # For transformer
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from google import genai
 from google.genai import types
 
 # Audio Transcribe
-from pydub import AudioSegment
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import numpy as np
 
 ############################################
@@ -29,8 +27,9 @@ if not GEMINI_API_KEY:
     raise RuntimeError("GEMINI_API_KEY environment variable must be set!")
 
 # Tiny Whisper model is light enough for CPU Spaces; change if GPU is available
-ASR_MODEL_ID = "openai/whisper-tiny" # ~39 MB
+ASR_MODEL_ID = "openai/whisper-small.en"
 ASR_LANGUAGE = "en" # Force to English for interview setting
+SAMPLE_RATE = 16000
 
 ############################################
 # ── FastAPI App ───────────────────────────
@@ -55,33 +54,8 @@ app.mount("/statics", StaticFiles(directory="statics"), name="statics")
 processor = None
 model = None
 
-
-@app.on_event("startup")
-async def load_models():
-    global processor, model
-    cache_path = Path("model_cache") # local writable path inside Hugging Face Space
-    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
-    model.to("cpu")
-
-
-############################################
-# ── Helpers ───────────────────────────────
-############################################
-
-def build_prompt(question: str) -> str:
-    """Craft a prompt that elicits concise, structured answers."""
-    return (
-        "You are a helpful career‑coach AI. Answer the following interview "
-        "question clearly and concisely, offering practical insights when "
-        "appropriate.\n\n"
-        f"Interview question: \"{question}\""
-    )
-
-def memory_usage_mb() -> float:
-    return psutil.Process().memory_info().rss / 1_048_576 # bytes→MiB
-
 # Enable Logging for Debugging
+import psutil
 import logging
 # Set up app-specific logger
 logger = logging.getLogger("triage-response")
@@ -95,7 +69,6 @@ logger.addHandler(handler)
 for noisy in ["pymongo", "urllib3", "httpx", "uvicorn", "uvicorn.error", "uvicorn.access"]:
     logging.getLogger(noisy).setLevel(logging.WARNING)
 # Monitor Resources Before Startup
-import psutil
 def check_system_resources():
     memory = psutil.virtual_memory()
     cpu = psutil.cpu_percent(interval=1)
@@ -110,6 +83,37 @@ def check_system_resources():
         logger.warning("⚠️ High Disk usage detected!")
 check_system_resources()
 
+# Startup
+@app.on_event("startup")
+async def load_models():
+    global processor, model
+    cache = Path("model_cache"); cache.mkdir(exist_ok=True)
+    # in startup (Transformer Whisper processing)
+    processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
+    model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache_path)
+    # Force English transcription – never translate
+    forced = processor.get_decoder_prompt_ids(language="english", task="transcribe")
+    model.config.forced_decoder_ids = forced
+    model.to("cpu")
+    model.eval()
+    logger.info("[STARTUP] Whisper loaded ✔")
+
+
+############################################
+# ── Helpers ───────────────────────────────
+############################################
+
+def build_prompt(question: str) -> str:
+    """Craft a prompt that elicits concise, structured answers."""
+    return (
+        "You are a helpful career‑coach AI. Answer the following interview "
+        "question clearly and concisely, offering practical insights when appropriate.\n"
+        "Use markdown for **bold**, *italic*, and bullet‑lists when helpful.\n\n"
+        f"Interview question: \"{question}\""
+    )
+
+def memory_usage_mb() -> float:
+    return psutil.Process().memory_info().rss / 1_048_576 # bytes→MiB
 
 ############################################
 # ── Routes ────────────────────────────────
@@ -133,18 +137,12 @@ async def voice_transcribe(file: UploadFile = File(...)): # noqa: B008
        tmp_path = tmp.name
    try:
        # ── 1. Transcribe
-       # Load audio using pydub (which handles WebM/Opus/MP3/etc.)
-       audio = AudioSegment.from_file(tmp_path)
-       audio = audio.set_frame_rate(16000).set_channels(1) # Whisper expects mono 16kHz
-       samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15) # normalize int16
-       # Obtain speech and process to tensor
-       speech = samples
-       inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
-       input_features = inputs["input_features"].to("cpu")
-       generated_ids = model.generate(input_features)
-       question = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-       if not question:
-           raise ValueError("Empty transcription")
+       seg = AudioSegment.from_file(tmp_path).set_frame_rate(SAMPLE_RATE).set_channels(1)
+       audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2**15)
+       inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
+       ids = model.generate(inputs.input_features.to(model.device))
+       question = processor.decode(ids[0], skip_special_tokens=True).strip()
+       if not question: raise ValueError("Could not detect speech")
        logger.info(f"[VOICE] Detected transcribe: {question}")
        # ── 2. LLM answer
        prompt = build_prompt(question)
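
For reference, the new transcription path introduced above condenses to the short standalone sketch below. This is an illustrative reconstruction, not the exact Space code: it assumes pydub (with an ffmpeg backend) is still installed for decoding, since requirements.txt still lists it, it uses a single cache variable consistently for the model cache directory, and the input file record.wav is a placeholder for the uploaded blob. The English-forcing step is omitted because whisper-small.en is an English-only checkpoint.

# Hypothetical standalone sketch of the commit's transcription path (CPU-only, like the Space)
from pathlib import Path

import numpy as np
from pydub import AudioSegment  # assumes ffmpeg is available on PATH
from transformers import WhisperProcessor, WhisperForConditionalGeneration

ASR_MODEL_ID = "openai/whisper-small.en"
SAMPLE_RATE = 16000

cache = Path("model_cache")
cache.mkdir(exist_ok=True)
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID, cache_dir=cache)
model.to("cpu").eval()

def transcribe(path: str) -> str:
    # Decode whatever the browser recorded, then downmix to mono 16 kHz for Whisper
    seg = AudioSegment.from_file(path).set_frame_rate(SAMPLE_RATE).set_channels(1)
    audio = np.array(seg.get_array_of_samples()).astype(np.float32) / (2 ** 15)  # int16 → [-1, 1]
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    ids = model.generate(inputs.input_features.to(model.device))
    return processor.decode(ids[0], skip_special_tokens=True).strip()

if __name__ == "__main__":
    print(transcribe("record.wav"))  # "record.wav" is a placeholder clip

Keeping the resampling in pydub means any container the browser produces (WebM/Opus, WAV, MP3) can be fed straight into Whisper's expected mono 16 kHz input.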
requirements.txt CHANGED
@@ -5,13 +5,15 @@ aiofiles # Static file serving
 python-multipart # File uploads
 
 # Voice‑to‑text (Whisper via Transformers)
-transformers # For language processing
+transformers # For whisper
 torch # Just to run transformer so don't remove
 huggingface_hub
+accelerate
 
 # Audio
 pydub
 ffmpeg-python
+openai-whisper # pulls tiny‑en / small‑en
 
 # Gemini Flash 2.5
 google-genai
statics/index.html CHANGED
@@ -3,23 +3,23 @@
 <head>
   <meta charset="UTF-8" />
   <title>Interview Q&A Assistant</title>
-  <meta name="viewport" content="width=device-width, initial-scale=1" />
-  <link rel="stylesheet" href="/statics/styles.css" />
-  <link rel="icon" type="image/png" href="/statics/icon.png" />
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <link rel="stylesheet" href="/statics/styles.css"/>
+  <link rel="icon" type="image/png" href="/statics/icon.png"/>
+  <!-- markdown‑to‑html -->
+  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/purify.min.js"></script>
 </head>
 <body>
   <main class="container">
     <h1>Interview Q&amp;A Assistant</h1>
     <p class="subtitle">Hold the button, ask your interview question, release to get an answer.</p>
-
     <button id="record-button" class="record-btn">🎙 Hold&nbsp;to&nbsp;Ask</button>
-
     <section class="output-section">
       <h2>Your Question</h2>
       <pre id="question-output" class="output"></pre>
-
       <h2>AI&nbsp;Answer</h2>
-      <pre id="answer-output" class="output"></pre>
+      <pre id="answer-output" class="output markdown"></pre>
     </section>
   </main>
 
statics/script.js CHANGED
@@ -19,33 +19,29 @@ function typeEffect(el, text, speed = 30) {
 }
 
 // Audio recording setup
-let mediaRecorder = null;
-let chunks = [];
-
+let mediaRecorder, chunks = [];
+// Initialise media data
 async function initMedia() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
     mediaRecorder = new MediaRecorder(stream);
-
+    // Continuously push available data
     mediaRecorder.ondataavailable = e => chunks.push(e.data);
-
+    // Stop btn start write and send audio chunk
    mediaRecorder.onstop = async () => {
      const audioBlob = new Blob(chunks, { type: "audio/wav" });
      chunks = [];
-
      // Build form data
      const form = new FormData();
      form.append("file", audioBlob, "record.wav");
-
      // UX feedback
      typeEffect(questionEl, "⌛ Transcribing…");
      answerEl.textContent = "";
-
      try {
        const res = await fetch("/voice-transcribe", { method: "POST", body: form });
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const data = await res.json();
-
+        // render markdown after a small delay for dramatic effect
        typeEffect(questionEl, data.question || "[no speech detected]");
        setTimeout(() => typeEffect(answerEl, data.answer || "[no answer]"), 500);
      } catch (err) {
@@ -62,7 +58,6 @@ function bindRecordBtn() {
   if (!mediaRecorder) return;
   recordBtn.addEventListener("mousedown", () => mediaRecorder.start());
   recordBtn.addEventListener("mouseup", () => mediaRecorder.stop());
-
   // Touch devices
   recordBtn.addEventListener("touchstart", e => { e.preventDefault(); mediaRecorder.start(); });
   recordBtn.addEventListener("touchend", e => { e.preventDefault(); mediaRecorder.stop(); });
@@ -70,6 +65,6 @@ function bindRecordBtn() {
 
 // Init on page load
 window.addEventListener("DOMContentLoaded", async () => {
-  await initMedia();
-  bindRecordBtn();
+  try { await initMedia(); bindRecordBtn(); }
+  catch (e) { alert("Mic permission required"); }
 });
statics/styles.css CHANGED
@@ -32,4 +32,10 @@ h1 { margin-top: 0; text-align: center; color: var(--primary); }
   background: #000; color: #0f0; padding: 16px; min-height: 60px;
   border-radius: 4px; overflow-x: auto; font-family: var(--mono);
   white-space: pre-wrap; word-wrap: break-word;
-}
+}
+
+/* --- new markdown styling --- */
+.output.markdown h3, .output.markdown h2 { color:#55f; margin:6px 0; }
+.output.markdown strong { font-weight:bold; color:#fff; background:#333; padding:0 4px; border-radius:3px; }
+.output.markdown em { font-style:italic; color:#ffd700; }
+.output.markdown ul { margin:4px 0 4px 20px; }