import gradio as gr
import librosa
import numpy as np
import openai
import os
import requests
from transformers import pipeline
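# Assumed dependencies (nothing is pinned here; a plausible install line):
#   pip install gradio librosa "openai<1.0" transformers torch requests numpy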

# Initialize the text emotion classifier
text_emotion_classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    device=-1,  # -1 selects CPU
)
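# The model is fine-tuned on the "emotion" dataset and typically returns one
# of sadness / joy / love / anger / fear / surprise, e.g. (illustrative):
#   [{"label": "joy", "score": 0.98}]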

# Environment variables; both API keys must be set for the chat tab to work.
# The VOICE_ID fallback appears to be a stock ElevenLabs voice id.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
VOICE_ID = os.getenv("VOICE_ID", "9BWtsMINqrJLrRacOk9x")

def analyze_text_emotion(text):
    try:
        emotion_result = text_emotion_classifier(text)
        emotion_data = emotion_result[0]
        return f"Emotion: {emotion_data['label']}\nConfidence: {emotion_data['score']:.2f}"
    except Exception as e:
        return f"Error: {str(e)}"


def analyze_voice_emotion(audio):
    try:
        if audio is None:
            return "Please upload an audio file"

        # Gradio's "numpy" audio type yields a (sample_rate, data) tuple
        sr, y = audio

        # librosa expects mono float audio in [-1, 1]; Gradio may hand back
        # int16 PCM and/or a stereo (n_samples, 2) array, so normalize both
        if y.ndim > 1:
            y = y.mean(axis=1)
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype("float32") / np.iinfo(y.dtype).max
        else:
            y = y.astype("float32")

        # Extract rough prosody features as Python scalars. The spectral
        # centroid is a brightness measure, used here only as a crude
        # stand-in for pitch rather than a true pitch estimate.
        pitch = float(librosa.feature.spectral_centroid(y=y, sr=sr).mean())
        intensity = float(librosa.feature.rms(y=y).mean())
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        
        # Convert tempo to Python float to avoid numpy formatting issues
        tempo = float(tempo)

        # Map the features to an emotion label; these thresholds are
        # hand-tuned heuristics, not a validated model
        if pitch < 150 and intensity < 0.02:
            emotion = "sadness"
        elif pitch > 200 and intensity > 0.05:
            emotion = "anger"
        elif pitch > 150 and intensity < 0.03:
            emotion = "joy"
        else:
            emotion = "anxiety"

        # Format the output using Python floats instead of numpy values
        return "Emotion: {}\nPitch: {:.2f}\nIntensity: {:.2f}\nTempo: {:.2f}".format(
            emotion, pitch, intensity, tempo
        )
    except Exception as e:
        return f"Error analyzing audio: {str(e)}"


def chat_and_tts(message):
    try:
        if not OPENAI_API_KEY or not ELEVEN_LABS_API_KEY:
            return "API keys not configured", None
            
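        # Note: this uses the legacy OpenAI SDK interface (openai < 1.0); on
        # openai >= 1.0 the equivalent is OpenAI().chat.completions.create(...)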
        openai.api_key = OPENAI_API_KEY
        chat_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message},
            ]
        )
        response_text = chat_response['choices'][0]['message']['content'].strip()

        url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
        headers = {
            "xi-api-key": ELEVEN_LABS_API_KEY,
            "Content-Type": "application/json"
        }
        data = {
            "text": response_text,
            "voice_settings": {
                "stability": 0.75,
                "similarity_boost": 0.75
            }
        }
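        # On success the ElevenLabs TTS endpoint responds with raw MP3 bytes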
        response = requests.post(url, json=data, headers=headers, timeout=60)

        if response.status_code != 200:
            # Degrade gracefully to text-only output if TTS fails
            return response_text, None
            
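        # A fixed output filename means concurrent requests overwrite each
        # other's audio; fine for a single-user demo (use tempfile otherwise)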
        audio_path = "response.mp3"
        with open(audio_path, "wb") as f:
            f.write(response.content)
            
        return response_text, audio_path
    except Exception as e:
        return f"Error: {str(e)}", None

# Create Gradio interface
demo = gr.Blocks(title="AI Therapist")

with demo:
    gr.Markdown("# AI Virtual Therapist")
    
    with gr.Tab("Text Emotion Analysis"):
        text_input = gr.Textbox(label="Enter text")
        text_button = gr.Button("Analyze Text Emotion")
        text_output = gr.Textbox(label="Emotion Analysis Result")
        text_button.click(analyze_text_emotion, inputs=text_input, outputs=text_output)
    
    with gr.Tab("Voice Emotion Analysis"):
        audio_input = gr.Audio(label="Upload Audio", type="numpy")
        audio_button = gr.Button("Analyze Voice Emotion")
        audio_output = gr.Textbox(label="Voice Analysis Result")
        audio_button.click(analyze_voice_emotion, inputs=audio_input, outputs=audio_output)
    
    with gr.Tab("Chat with TTS"):
        chat_input = gr.Textbox(label="Enter your message")
        chat_button = gr.Button("Send Message")
        chat_output = gr.Textbox(label="Assistant Response")
        tts_audio_output = gr.Audio(label="Voice Response")
        chat_button.click(chat_and_tts, inputs=chat_input, outputs=[chat_output, tts_audio_output])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
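    # The UI is then reachable at http://localhost:7860 (binding to 0.0.0.0
    # also exposes it on the host's network address)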