import gradio as gr
import librosa
import numpy as np
import openai
import os
import requests
from transformers import pipeline
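# Assumed dependencies (nothing is pinned here; a plausible install line):
#   pip install gradio librosa "openai<1.0" transformers torch requests numpy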

# Initialize the text emotion classifier
text_emotion_classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    device=-1,  # -1 selects CPU
)
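# The model is fine-tuned on the "emotion" dataset and typically returns one
# of sadness / joy / love / anger / fear / surprise, e.g. (illustrative):
#   [{"label": "joy", "score": 0.98}]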

# Environment variables; both API keys must be set for the chat tab to work.
# The VOICE_ID fallback appears to be a stock ElevenLabs voice id.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID", "9BWtsMINqrJLrRacOk9x")

def analyze_text_emotion(text):
    try:
        emotion_result = text_emotion_classifier(text)
        emotion_data = emotion_result[0]
        return f"Emotion: {emotion_data['label']}\nConfidence: {emotion_data['score']:.2f}"
    except Exception as e:
        return f"Error: {str(e)}"


def analyze_voice_emotion(audio):
    try:
        if audio is None:
            return "Please upload an audio file"

        # Gradio's "numpy" audio type yields a (sample_rate, data) tuple
        sr, y = audio

        # librosa expects mono float audio in [-1, 1]; Gradio may hand back
        # int16 PCM and/or a stereo (n_samples, 2) array, so normalize both
        if y.ndim > 1:
            y = y.mean(axis=1)
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype("float32") / np.iinfo(y.dtype).max
        else:
            y = y.astype("float32")

        # Extract rough prosody features as Python scalars. The spectral
        # centroid is a brightness measure, used here only as a crude
        # stand-in for pitch rather than a true pitch estimate.
        pitch = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
        intensity = float(librosa.feature.rms(y=y).mean())
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        
        # Convert tempo to Python float to avoid numpy formatting issues
        tempo = float(tempo)

        # Map the features to an emotion label; these thresholds are
        # hand-tuned heuristics, not a validated model
        if pitch < 150 and intensity < 0.02:
            emotion = "sadness"
        elif pitch > 200 and intensity > 0.05:
            emotion = "anger"
        elif pitch > 150 and intensity < 0.03:
            emotion = "joy"
        else:
            emotion = "anxiety"

        # Format the output using Python floats instead of numpy values
        return "Emotion: {}\nPitch: {:.2f}\nIntensity: {:.2f}\nTempo: {:.2f}".format(
            emotion, pitch, intensity, tempo
        )
    except Exception as e:
        return f"Error analyzing audio: {str(e)}"


def chat_and_tts(message):
    try:
        if not OPENAI_API_KEY or not ELEVEN_LABS_API_KEY:
            return "API keys not configured", None
            
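        # Note: this uses the legacy OpenAI SDK interface (openai < 1.0); on
        # openai >= 1.0 the equivalent is OpenAI().chat.completions.create(...)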
        openai.api_key = OPENAI_API_KEY
        chat_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message},
            ]
        )
        response_text = chat_response['choices'][0]['message']['content'].strip()

        url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
        headers = {
            "xi-api-key": ELEVEN_LABS_API_KEY,
            "Content-Type": "application/json"
        }
        data = {
            "text": response_text,
            "voice_settings": {
                "stability": 0.75,
                "similarity_boost": 0.75
            }
        }
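        # On success the ElevenLabs TTS endpoint responds with raw MP3 bytes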
        response = requests.post(url, json=data, headers=headers, timeout=60)

        if response.status_code != 200:
            # Degrade gracefully to text-only output if TTS fails
            return response_text, None
            
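        # A fixed output filename means concurrent requests overwrite each
        # other's audio; fine for a single-user demo (use tempfile otherwise)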
        audio_path = "response.mp3"
        with open(audio_path, "wb") as f:
            f.write(response.content)
            
        return response_text, audio_path
    except Exception as e:
        return f"Error: {str(e)}", None

# Create Gradio interface
demo = gr.Blocks(title="AI Therapist")

with demo:
    gr.Markdown("# AI Virtual Therapist")
    
    with gr.Tab("Text Emotion Analysis"):
        text_input = gr.Textbox(label="Enter text")
        text_button = gr.Button("Analyze Text Emotion")
        text_output = gr.Textbox(label="Emotion Analysis Result")
        text_button.click(analyze_text_emotion, inputs=text_input, outputs=text_output)
    
    with gr.Tab("Voice Emotion Analysis"):
        audio_input = gr.Audio(label="Upload Audio", type="numpy")
        audio_button = gr.Button("Analyze Voice Emotion")
        audio_output = gr.Textbox(label="Voice Analysis Result")
        audio_button.click(analyze_voice_emotion, inputs=audio_input, outputs=audio_output)
    
    with gr.Tab("Chat with TTS"):
        chat_input = gr.Textbox(label="Enter your message")
        chat_button = gr.Button("Send Message")
        chat_output = gr.Textbox(label="Assistant Response")
        tts_audio_output = gr.Audio(label="Voice Response")
        chat_button.click(chat_and_tts, inputs=chat_input, outputs=[chat_output, tts_audio_output])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
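    # The UI is then reachable at http://localhost:7860 (binding to 0.0.0.0
    # also exposes it on the host's network address)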