# NOTE(review): the following metadata is Hugging Face Space page residue from the
# scrape that produced this file, preserved here as a comment so the module parses:
#   file size: 3,595 bytes, revision 1f75494, 93 lines.
from fastapi import FastAPI, File, UploadFile
import uvicorn
import openai
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
import whisper
import os
app = FastAPI()
# Load Whisper model for transcription
whisper_model = whisper.load_model("small")
# Load speech emotion recognition model
ser_model_name = "superb/wav2vec2-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ser_model_name)
ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
# OpenAI API Key
openai.api_key = os.getenv("OPENAI_API_KEY") # Ensure you set this in the terminal before running
@app.post("/process_audio/")
async def process_audio(file: UploadFile = File(...)):
try:
print(f"β
File received: {file.filename}")
# Save audio
audio_path = "temp_audio.wav"
with open(audio_path, "wb") as f:
f.write(await file.read())
print("β
Audio saved successfully!")
# π’ TEST 1: Check if the file is corrupted
try:
waveform, sample_rate = torchaudio.load(audio_path)
print(f"β
Audio loaded! Shape: {waveform.shape}, Sample Rate: {sample_rate}")
except Exception as e:
return {"error": f"β Audio loading failed: {e}"}
# π’ TEST 2: Whisper Transcription
try:
transcription = whisper_model.transcribe(audio_path)["text"]
print(f"β
Whisper Transcription: {transcription}")
except Exception as e:
return {"error": f"β Whisper failed: {e}"}
# π’ TEST 3: Emotion Recognition
try:
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
if sample_rate != 16000:
resampler = T.Resample(sample_rate, 16000)
waveform = resampler(waveform)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
logits = ser_model(**inputs).logits
predicted_class = torch.argmax(logits, dim=-1).item()
emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
emotion_detected = emotions[predicted_class] if predicted_class < len(emotions) else "unknown"
print(f"β
Emotion Detected: {emotion_detected}")
except Exception as e:
return {"error": f"β Emotion recognition failed: {e}"}
# π’ TEST 4: OpenAI API Summarization
try:
summary_response = openai.ChatCompletion.create(
model="gpt-4-turbo",
messages=[
{"role": "system", "content": "Summarize the following text."},
{"role": "user", "content": transcription}
]
)
summary = summary_response["choices"][0]["message"]["content"]
print(f"β
OpenAI Summary: {summary}")
except Exception as e:
return {"error": f"β OpenAI Summarization failed: {e}"}
return {
"transcription": transcription,
"emotion": emotion_detected,
"summary": summary
}
except Exception as e:
print(f"β Error in process_audio: {e}")
return {"error": str(e)}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)