import base64
import tempfile
import requests
import gradio as gr
from openai import OpenAI

# Available voices for audio generation
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "verse"]
def process_text_input(api_key, text_prompt, selected_voice):
    """Generate audio response from text input"""
    try:
        if not api_key:
            return "Please enter your OpenAI API key first.", None

        # Initialize OpenAI client with the provided API key
        client = OpenAI(api_key=api_key)

        # Request both a text and an audio rendering of the answer
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": text_prompt
                }
            ]
        )

        # Decode the base64 audio payload and save it to a temporary file
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # Get the text response; when audio output is requested, the text may
        # arrive as the audio transcript rather than in message.content
        message = completion.choices[0].message
        text_response = message.content or message.audio.transcript
        return text_response, temp_path
    except Exception as e:
        return f"Error: {str(e)}", None
def process_audio_input(api_key, audio_path, text_prompt, selected_voice):
    """Process audio input and generate a response"""
    try:
        if not api_key:
            return "Please enter your OpenAI API key first.", None
        if not audio_path:
            return "Please upload or record audio first.", None

        # Initialize OpenAI client with the provided API key
        client = OpenAI(api_key=api_key)

        # Read audio file and encode to base64
        with open(audio_path, "rb") as audio_file:
            audio_data = audio_file.read()
        encoded_audio = base64.b64encode(audio_data).decode('utf-8')

        # Create message content with both text and audio
        message_content = []
        if text_prompt:
            message_content.append({
                "type": "text",
                "text": text_prompt
            })
        message_content.append({
            "type": "input_audio",
            "input_audio": {
                "data": encoded_audio,
                "format": "wav"
            }
        })

        # Call OpenAI API
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": message_content
                }
            ]
        )

        # Decode the base64 audio payload and save it to a temporary file
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # Get the text response, falling back to the audio transcript
        message = completion.choices[0].message
        text_response = message.content or message.audio.transcript
        return text_response, temp_path
    except Exception as e:
        return f"Error: {str(e)}", None
def download_example_audio():
    """Download an example audio file for testing"""
    try:
        url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Save to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
            return f.name
    except Exception:
        return None


def use_example_audio():
    """Load example audio for the interface"""
    return download_example_audio()
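
# If the download fails, use_example_audio returns None and the gr.Audio
# component is simply left empty; no error message is surfaced to the user.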
# Create Gradio Interface
with gr.Blocks(title="OpenAI Audio Chat App") as app:
    gr.Markdown("# OpenAI Audio Chat App")
    gr.Markdown("Interact with the GPT-4o audio model through text and audio inputs")

    # API key input (shared across all tabs)
    api_key = gr.Textbox(
        label="OpenAI API Key",
        placeholder="Enter your OpenAI API key here",
        type="password"
    )

    with gr.Tab("Text to Audio"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text Prompt",
                    placeholder="Enter your question or prompt here...",
                    lines=3
                )
                text_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Voice"
                )
                text_submit = gr.Button("Generate Response")
            with gr.Column():
                text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_output = gr.Audio(label="AI Response (Audio)")

        text_submit.click(
            fn=process_text_input,
            inputs=[api_key, text_input, text_voice],
            outputs=[text_output, audio_output]
        )

    with gr.Tab("Audio Input to Audio Response"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                example_btn = gr.Button("Use Example Audio")
                accompanying_text = gr.Textbox(
                    label="Accompanying Text (Optional)",
                    placeholder="Add any text context or question about the audio...",
                    lines=2
                )
                audio_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Response Voice"
                )
                audio_submit = gr.Button("Process Audio & Generate Response")
            with gr.Column():
                audio_text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_audio_output = gr.Audio(label="AI Response (Audio)")

        audio_submit.click(
            fn=process_audio_input,
            inputs=[api_key, audio_input, accompanying_text, audio_voice],
            outputs=[audio_text_output, audio_audio_output]
        )

        example_btn.click(
            fn=use_example_audio,
            inputs=[],
            outputs=[audio_input]
        )
with gr.Tab("Voice Samples"): | |
gr.Markdown("## Listen to samples of each voice") | |
def generate_voice_sample(api_key, voice_type): | |
try: | |
if not api_key: | |
return "Please enter your OpenAI API key first.", None | |
client = OpenAI(api_key=api_key) | |
completion = client.chat.completions.create( | |
model="gpt-4o-audio-preview", | |
modalities=["text", "audio"], | |
audio={"voice": voice_type, "format": "wav"}, | |
messages=[ | |
{ | |
"role": "user", | |
"content": f"This is a sample of the {voice_type} voice. It has its own unique tone and character." | |
} | |
] | |
) | |
# Save the audio to a temporary file | |
wav_bytes = base64.b64decode(completion.choices[0].message.audio.data) | |
temp_path = tempfile.mktemp(suffix=".wav") | |
with open(temp_path, "wb") as f: | |
f.write(wav_bytes) | |
return f"Sample generated with voice: {voice_type}", temp_path | |
except Exception as e: | |
return f"Error: {str(e)}", None | |
with gr.Row(): | |
sample_voice = gr.Dropdown( | |
choices=VOICES, | |
value="alloy", | |
label="Select Voice Sample" | |
) | |
sample_btn = gr.Button("Generate Sample") | |
with gr.Row(): | |
sample_text = gr.Textbox(label="Status") | |
sample_audio = gr.Audio(label="Voice Sample") | |
sample_btn.click( | |
fn=generate_voice_sample, | |
inputs=[api_key, sample_voice], | |
outputs=[sample_text, sample_audio] | |
) | |
gr.Markdown(""" | |
## Notes: | |
- You must provide your OpenAI API key in the field above | |
- The model used is `gpt-4o-audio-preview` | |
- Audio inputs should be in WAV format | |
- Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse | |
""") | |
if __name__ == "__main__": | |
app.launch() |
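    # On Hugging Face Spaces the default launch() is sufficient; to expose a
    # temporary public URL when running locally, Gradio also supports
    # app.launch(share=True).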