import gradio as gr
from fastapi import FastAPI
import librosa
import openai
from transformers import pipeline
import requests
import os
import numpy as np

# Initialize FastAPI (not served by demo.launch(); see the mounting sketch at the end of the file)
app = FastAPI()

# Initialize the text emotion classifier on CPU (device=-1)
text_emotion_classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    device=-1,
)

# Environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID", "9BWtsMINqrJLrRacOk9x")


def analyze_text_emotion(text):
    try:
        emotion_result = text_emotion_classifier(text)
        emotion_data = emotion_result[0]
        return f"Emotion: {emotion_data['label']}\nConfidence: {emotion_data['score']:.2f}"
    except Exception as e:
        return f"Error: {str(e)}"


def analyze_voice_emotion(audio):
    try:
        if audio is None:
            return "Please upload an audio file"

        # Gradio's numpy audio input is a (sample_rate, samples) tuple
        sr, y = audio

        # Convert integer PCM to float32 in [-1, 1] so the intensity thresholds below are meaningful
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        elif y.dtype != np.float32:
            y = y.astype(np.float32)

        # Mix stereo down to mono; the librosa features below expect a 1-D signal
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Extract features and convert numpy values to Python scalars
        # (the spectral centroid is used as a rough proxy for pitch/brightness)
        pitch = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
        intensity = float(librosa.feature.rms(y=y).mean())
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # Convert tempo to a Python float to avoid numpy formatting issues
        tempo = float(tempo)

        # Heuristic emotion label based on the extracted features
        if pitch < 150 and intensity < 0.02:
            emotion = "sadness"
        elif pitch > 200 and intensity > 0.05:
            emotion = "anger"
        elif pitch > 150 and intensity < 0.03:
            emotion = "joy"
        else:
            emotion = "anxiety"

        # Format the output using Python floats instead of numpy values
        return "Emotion: {}\nPitch: {:.2f}\nIntensity: {:.2f}\nTempo: {:.2f}".format(
            emotion, pitch, intensity, tempo
        )
    except Exception as e:
        return f"Error analyzing audio: {str(e)}"


def chat_and_tts(message):
    try:
        if not OPENAI_API_KEY or not ELEVEN_LABS_API_KEY:
            return "API keys not configured", None

        # Generate a reply with the legacy (pre-1.0) openai interface imported above
        openai.api_key = OPENAI_API_KEY
        chat_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message},
            ],
        )
        response_text = chat_response["choices"][0]["message"]["content"].strip()

        # Synthesize speech for the reply with the ElevenLabs text-to-speech API
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
        headers = {
            "xi-api-key": ELEVEN_LABS_API_KEY,
            "Content-Type": "application/json",
        }
        data = {
            "text": response_text,
            "voice_settings": {"stability": 0.75, "similarity_boost": 0.75},
        }
        response = requests.post(url, json=data, headers=headers)

        # Fall back to text-only output if speech synthesis fails
        if response.status_code != 200:
            return response_text, None

        audio_path = "response.mp3"
        with open(audio_path, "wb") as f:
            f.write(response.content)

        return response_text, audio_path
    except Exception as e:
        return f"Error: {str(e)}", None


# Create Gradio interface
demo = gr.Blocks(title="AI Therapist")

with demo:
    gr.Markdown("# AI Virtual Therapist")

    with gr.Tab("Text Emotion Analysis"):
        text_input = gr.Textbox(label="Enter text")
        text_button = gr.Button("Analyze Text Emotion")
        text_output = gr.Textbox(label="Emotion Analysis Result")
        text_button.click(analyze_text_emotion, inputs=text_input, outputs=text_output)

    with gr.Tab("Voice Emotion Analysis"):
        audio_input = gr.Audio(label="Upload Audio", type="numpy")
        audio_button = gr.Button("Analyze Voice Emotion")
        audio_output = gr.Textbox(label="Voice Analysis Result")
        audio_button.click(analyze_voice_emotion, inputs=audio_input, outputs=audio_output)

    with gr.Tab("Chat with TTS"):
        chat_input = gr.Textbox(label="Enter your message")
        chat_button = gr.Button("Send Message")
        chat_output = gr.Textbox(label="Assistant Response")
        # Named distinctly from the Voice tab's audio_output textbox to avoid rebinding that name
        tts_audio_output = gr.Audio(label="Voice Response")
        chat_button.click(chat_and_tts, inputs=chat_input, outputs=[chat_output, tts_audio_output])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
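
# The FastAPI instance created above is not used by demo.launch(), which starts Gradio's
# own server. A minimal sketch, assuming Gradio's mount_gradio_app helper, of attaching
# the same Blocks UI to that app so the file could also be served through an ASGI server
# (e.g. `uvicorn app:app --port 7860`; the module name is an assumption):
app = gr.mount_gradio_app(app, demo, path="/")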