# AI_Virtual / app.py
import os

import gradio as gr
import librosa
import numpy as np
import openai
import requests
from transformers import pipeline
# Initialize text emotion classifier (runs on CPU: device=-1)
text_emotion_classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    device=-1,
)
# Environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID", "9BWtsMINqrJLrRacOk9x")
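# These are read from the host / Space environment, e.g. (illustrative shell values):
#   export OPENAI_API_KEY="sk-..."
#   export ELEVEN_LABS_API_KEY="..."
#   export VOICE_ID="..."  # optional; falls back to the default above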
def analyze_text_emotion(text):
try:
emotion_result = text_emotion_classifier(text)
emotion_data = emotion_result[0]
return f"Emotion: {emotion_data['label']}\nConfidence: {emotion_data['score']:.2f}"
except Exception as e:
return f"Error: {str(e)}"
def analyze_voice_emotion(audio):
try:
if audio is None:
return "Please upload an audio file"
        # Gradio's numpy audio component passes a (sample_rate, samples) tuple
        sr, y = audio
        y = np.asarray(y)
        # Scale integer PCM (Gradio typically delivers int16) to floats in [-1, 1]
        # so the intensity thresholds below behave as intended
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        else:
            y = y.astype(np.float32)
        # Mix stereo down to mono for librosa
        if y.ndim > 1:
            y = y.mean(axis=1)
        # Extract features; the spectral centroid serves as a rough pitch proxy.
        # Convert numpy values to Python floats for clean string formatting.
        pitch = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
        intensity = float(librosa.feature.rms(y=y).mean())
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        tempo = float(np.atleast_1d(tempo)[0])
# Determine emotion based on features
if pitch < 150 and intensity < 0.02:
emotion = "sadness"
elif pitch > 200 and intensity > 0.05:
emotion = "anger"
elif pitch > 150 and intensity < 0.03:
emotion = "joy"
else:
emotion = "anxiety"
# Format the output using Python floats instead of numpy values
return "Emotion: {}\nPitch: {:.2f}\nIntensity: {:.2f}\nTempo: {:.2f}".format(
emotion, pitch, intensity, tempo
)
except Exception as e:
return f"Error analyzing audio: {str(e)}"
def chat_and_tts(message):
try:
if not OPENAI_API_KEY or not ELEVEN_LABS_API_KEY:
return "API keys not configured", None
        # Legacy openai<1.0 SDK call; openai>=1.0 replaces this with the
        # client-based API (see the sketch after this function)
        openai.api_key = OPENAI_API_KEY
        chat_response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": message},
]
)
response_text = chat_response['choices'][0]['message']['content'].strip()
url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
headers = {
"xi-api-key": ELEVEN_LABS_API_KEY,
"Content-Type": "application/json"
}
data = {
"text": response_text,
"voice_settings": {
"stability": 0.75,
"similarity_boost": 0.75
}
}
        response = requests.post(url, json=data, headers=headers, timeout=60)
if response.status_code != 200:
return response_text, None
audio_path = "response.mp3"
with open(audio_path, "wb") as f:
f.write(response.content)
return response_text, audio_path
except Exception as e:
return f"Error: {str(e)}", None
# Create Gradio interface
demo = gr.Blocks(title="AI Therapist")
with demo:
gr.Markdown("# AI Virtual Therapist")
with gr.Tab("Text Emotion Analysis"):
text_input = gr.Textbox(label="Enter text")
text_button = gr.Button("Analyze Text Emotion")
text_output = gr.Textbox(label="Emotion Analysis Result")
text_button.click(analyze_text_emotion, inputs=text_input, outputs=text_output)
with gr.Tab("Voice Emotion Analysis"):
audio_input = gr.Audio(label="Upload Audio", type="numpy")
audio_button = gr.Button("Analyze Voice Emotion")
audio_output = gr.Textbox(label="Voice Analysis Result")
audio_button.click(analyze_voice_emotion, inputs=audio_input, outputs=audio_output)
with gr.Tab("Chat with TTS"):
chat_input = gr.Textbox(label="Enter your message")
chat_button = gr.Button("Send Message")
chat_output = gr.Textbox(label="Assistant Response")
        tts_audio_output = gr.Audio(label="Voice Response")
        chat_button.click(chat_and_tts, inputs=chat_input, outputs=[chat_output, tts_audio_output])
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860)