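"""TransArt: a Gradio app that turns Tamil audio into multimodal content.

Pipeline: Tamil audio -> Tamil text (Groq Whisper) -> English translation
(Groq LLM) -> AI-generated image (Hugging Face Inference API) plus an
AI-generated English narrative (Groq LLM).
"""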
import os
import gradio as gr
import requests
import io
import re
from PIL import Image
from groq import Groq

# Load API keys from environment variables (never hardcode secrets)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")
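# Example (shell), with placeholder values:
#   export GROQ_API_KEY="gsk_..."
#   export HF_API_KEY="hf_..."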

if not GROQ_API_KEY or not HF_API_KEY:
    raise ValueError("GROQ_API_KEY and HF_API_KEY must be set as environment variables.")
# Initialize Groq API client
client = Groq(api_key=GROQ_API_KEY)

# Use a Public Hugging Face Image Model
HF_IMAGE_MODEL = "stabilityai/stable-diffusion-2-1"
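# For text-to-image models the Inference API responds with raw image bytes
# (PNG/JPEG) rather than JSON, so the body is decoded directly with PIL below.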

# Function 1: Tamil Audio to Tamil Text (Transcription)
def transcribe_audio(audio_path):
    if not audio_path:
        return "Error: Please upload an audio file."

    try:
        with open(audio_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_path), file.read()),
                model="whisper-large-v3",
                language="ta",  # Tamil
                response_format="verbose_json",
            )
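        # verbose_json also carries segment-level timestamps; only the plain
        # text is used here.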
        return transcription.text.strip()

    except Exception as e:
        return f"Error in transcription: {str(e)}"

# Function 2: Tamil Text to English Translation
def translate_tamil_to_english(tamil_text):
    if not tamil_text:
        return "Error: Please enter Tamil text for translation."

    prompt = f"Translate this Tamil text to English: {tamil_text}\nGive only the translated text as output."

    try:
        response = client.chat.completions.create(
            model="llama3-8b-8192",  # Groq-supported model
            messages=[{"role": "user", "content": prompt}],
        )
        translated_text = response.choices[0].message.content.strip()

        # Remove unwanted XML-like tags such as <think></think>
        translated_text = re.sub(r"</?think>", "", translated_text).strip()
        return translated_text

    except Exception as e:
        return f"Error in translation: {str(e)}"

# Function 3: English Text to Image Generation (Hugging Face)
def generate_image(english_text):
    if not english_text:
        return "Error: Please enter a description for image generation."

    try:
        headers = {"Authorization": f"Bearer {HF_API_KEY}"}
        payload = {"inputs": english_text}

        response = requests.post(
            f"https://api-inference.huggingface.co/models/{HF_IMAGE_MODEL}",
            headers=headers, json=payload,
            timeout=120,  # avoid hanging indefinitely on cold model loads
        )
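        # The Inference API returns HTTP 503 while a model is cold-loading;
        # raise_for_status() surfaces that as an exception handled below.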
        response.raise_for_status()
        image_bytes = response.content

        # Guard against an empty response body before decoding the image
        if not image_bytes:
            return "Error: Received empty response from API."

        return Image.open(io.BytesIO(image_bytes))

    except Exception as e:
        return f"Error in image generation: {str(e)}"

# Function 4: English Text to AI-Generated Text
def generate_text(english_text):
    if not english_text:
        return "Please enter a prompt."

    try:
        response = client.chat.completions.create(
            model="llama3-8b-8192",  #  Ensure you're using a valid model
            messages=[{"role": "user", "content": english_text}],
        )

        # Extract the response content
        generated_text = response.choices[0].message.content.strip()

        # Remove unwanted XML-like tags
        cleaned_text = re.sub(r"</?think>", "", generated_text).strip()

        return cleaned_text

    except Exception as e:
        return f"Error in text generation: {str(e)}"

# Combined Function to Process All Steps
def process_audio(audio_path):
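    # Each helper reports failure as a string starting with "Error", so every
    # step is gated on that sentinel before the pipeline continues.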
    # Step 1: Tamil Audio → Tamil Text
    tamil_text = transcribe_audio(audio_path)
    if "Error" in tamil_text:
        return tamil_text, None, None, None

    # Step 2: Tamil Text → English Text
    english_text = translate_tamil_to_english(tamil_text)
    if "Error" in english_text:
        return tamil_text, english_text, None, None

    # Step 3: English Text → Image
    image = generate_image(english_text)
    if isinstance(image, str) and "Error" in image:
        return tamil_text, english_text, None, None

    # Step 4: English Text → AI-Generated Text
    generated_text = generate_text(english_text)
    return tamil_text, english_text, image, generated_text


# Helper to reset all output fields
def clear_outputs():
    return "", "", None, ""

# --- Creative Gradio Interface ---
with gr.Blocks() as demo:
    # Title & Subtitle with Emojis
    gr.Markdown("### 🎨 **TransArt: Multimodal Tamil Audio Experience**")
    gr.Markdown("**Transform Tamil audio into captivating content** – from transcription and English translation to stunning AI-generated images and creative narratives! 🌟")

    # Visual Separator
    gr.Markdown("---")

    # Row for Audio Input + Buttons
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="🎵 Upload Tamil Audio")
        with gr.Column():
            submit_button = gr.Button("✨ Submit")
            clear_button = gr.Button("🧹 Clear")

    # Another Separator for clarity
    gr.Markdown("---")

    # Row for Transcribed Tamil (left) & Translated English (right)
    with gr.Row():
        transcribed_text = gr.Textbox(label="📝 Transcribed Tamil Text")
        translated_text = gr.Textbox(label="🌐 Translated English Text")

    # Separator
    gr.Markdown("---")

    # Row for Generated Image (left) & Generated Text (right)
    with gr.Row():
        generated_image = gr.Image(label="🎨 Generated AI Image")
        generated_text = gr.Textbox(label="💡 Generated English Text")

    # Button actions
    submit_button.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[transcribed_text, translated_text, generated_image, generated_text],
    )
    clear_button.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[transcribed_text, translated_text, generated_image, generated_text],
    )

demo.launch()
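
# Note: demo.launch(share=True) would also expose a temporary public URL
# (useful when running on Colab or a remote machine).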