WaveTalk / v1.txt
shukdevdatta123's picture
Create v1.txt
a2d4059 verified
import base64
import tempfile
import os
import requests
import gradio as gr
from openai import OpenAI
# Voice names offered in every voice dropdown of the UI and passed to the
# API as the ``voice`` option of the audio request.
VOICES = [
    "alloy",
    "ash",
    "ballad",
    "coral",
    "echo",
    "fable",
    "onyx",
    "nova",
    "sage",
    "shimmer",
    "verse",
]
def process_text_input(api_key, text_prompt, selected_voice):
    """Generate a text and audio reply to a text prompt.

    Sends ``text_prompt`` as a single user message to the
    ``gpt-4o-audio-preview`` chat model, requesting both text and WAV audio
    spoken with ``selected_voice``.

    Args:
        api_key: OpenAI API key used to create the client.
        text_prompt: The user's message.
        selected_voice: Voice name placed in the request's ``audio`` options.

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the path of a
        temporary ``.wav`` file containing the decoded audio; the file is not
        removed by this function. When an input is missing or the request
        fails, ``text`` is a human-readable message and ``audio_path`` is
        ``None``.
    """
    # Validate up front so the user gets a clear message instead of an
    # authentication or validation error from the API.
    if not api_key:
        return "Please enter your OpenAI API key first.", None
    if not text_prompt or not text_prompt.strip():
        return "Please enter a text prompt first.", None

    try:
        client = OpenAI(api_key=api_key)
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": text_prompt,
                }
            ],
        )
        message = completion.choices[0].message

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
        # returns a name and is open to a race before the file is opened.
        wav_bytes = base64.b64decode(message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # With audio output the written reply may arrive as the audio
        # transcript rather than in ``content``; use whichever is present.
        text_response = message.content or getattr(message.audio, "transcript", None)
        return text_response, temp_path
    except Exception as e:
        # UI boundary: report the failure in the text box instead of raising.
        return f"Error: {str(e)}", None
def process_audio_input(api_key, audio_path, text_prompt, selected_voice):
    """Send an audio clip (plus optional text) to the model and return its reply.

    The file at ``audio_path`` is read, base64-encoded and sent as an
    ``input_audio`` content part to ``gpt-4o-audio-preview``. If
    ``text_prompt`` is non-empty it is sent as a ``text`` part ahead of the
    audio. The reply is requested as text plus WAV audio spoken with
    ``selected_voice``.

    Args:
        api_key: OpenAI API key used to create the client.
        audio_path: Filesystem path of the clip to send.
        text_prompt: Optional accompanying text; skipped when empty.
        selected_voice: Voice name placed in the request's ``audio`` options.

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the path of a
        temporary ``.wav`` file containing the decoded reply; the file is not
        removed by this function. When an input is missing or the request
        fails, ``text`` is a human-readable message and ``audio_path`` is
        ``None``.
    """
    if not audio_path:
        return "Please upload or record audio first.", None
    if not api_key:
        return "Please enter your OpenAI API key first.", None

    try:
        client = OpenAI(api_key=api_key)

        with open(audio_path, "rb") as audio_file:
            encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        # Declare the container format from the file extension so an uploaded
        # .mp3 is not mislabelled; anything else keeps the "wav" label.
        extension = os.path.splitext(audio_path)[1].lower()
        audio_format = "mp3" if extension == ".mp3" else "wav"

        # Optional text part first, then the audio part.
        message_content = []
        if text_prompt:
            message_content.append({
                "type": "text",
                "text": text_prompt,
            })
        message_content.append({
            "type": "input_audio",
            "input_audio": {
                "data": encoded_audio,
                "format": audio_format,
            },
        })

        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": message_content,
                }
            ],
        )
        message = completion.choices[0].message

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
        # returns a name and is open to a race before the file is opened.
        wav_bytes = base64.b64decode(message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # With audio output the written reply may arrive as the audio
        # transcript rather than in ``content``; use whichever is present.
        text_response = message.content or getattr(message.audio, "transcript", None)
        return text_response, temp_path
    except Exception as e:
        # UI boundary: report the failure in the text box instead of raising.
        return f"Error: {str(e)}", None
def download_example_audio(
    url="https://cdn.openai.com/API/docs/audio/alloy.wav",
    timeout=30,
):
    """Download an example audio file for testing.

    Args:
        url: Address of the audio file to fetch. Defaults to the sample clip
            this app has always used.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The path of a temporary ``.wav`` file holding the downloaded bytes
        (not removed by this function), or ``None`` if the download or the
        write to disk fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
        # returns a name and is open to a race before the file is opened.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
            temp_path = f.name
        return temp_path
    except (requests.RequestException, OSError):
        # Best effort: the caller treats None as "no example available".
        return None
def use_example_audio():
    """Return the path of the example clip for the audio input widget.

    Delegates to ``download_example_audio`` and passes its result through
    unchanged.
    """
    return download_example_audio()
# Build the Gradio interface: one shared API-key field, three tabs, and a
# notes section at the bottom.
with gr.Blocks(title="OpenAI Audio Chat App") as app:
    gr.Markdown("# OpenAI Audio Chat App")
    gr.Markdown("Interact with GPT-4o audio model through text and audio inputs")

    # API key input; passed as the first input to every handler below.
    api_key = gr.Textbox(
        label="OpenAI API Key",
        placeholder="Enter your OpenAI API key here",
        type="password",
    )

    # Tab 1: text prompt in, text + audio reply out.
    with gr.Tab("Text to Audio"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text Prompt",
                    placeholder="Enter your question or prompt here...",
                    lines=3,
                )
                text_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Voice",
                )
                text_submit = gr.Button("Generate Response")
            with gr.Column():
                text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_output = gr.Audio(label="AI Response (Audio)")

        text_submit.click(
            fn=process_text_input,
            inputs=[api_key, text_input, text_voice],
            outputs=[text_output, audio_output],
        )

    # Tab 2: recorded or uploaded audio (plus optional text) in, reply out.
    with gr.Tab("Audio Input to Audio Response"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    sources=["microphone", "upload"],
                )
                example_btn = gr.Button("Use Example Audio")
                accompanying_text = gr.Textbox(
                    label="Accompanying Text (Optional)",
                    placeholder="Add any text context or question about the audio...",
                    lines=2,
                )
                audio_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Response Voice",
                )
                audio_submit = gr.Button("Process Audio & Generate Response")
            with gr.Column():
                audio_text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_audio_output = gr.Audio(label="AI Response (Audio)")

        audio_submit.click(
            fn=process_audio_input,
            inputs=[api_key, audio_input, accompanying_text, audio_voice],
            outputs=[audio_text_output, audio_audio_output],
        )
        # Fills the audio input widget with the downloaded example clip.
        example_btn.click(
            fn=use_example_audio,
            inputs=[],
            outputs=[audio_input],
        )

    # Tab 3: generate a short sample for a chosen voice.
    with gr.Tab("Voice Samples"):
        gr.Markdown("## Listen to samples of each voice")

        def generate_voice_sample(api_key, voice_type):
            """Request a short audio sample for ``voice_type``.

            Returns a ``(status, audio_path)`` tuple. ``audio_path`` is a
            temporary ``.wav`` file (not removed here), or ``None`` when the
            API key is missing or the request fails, in which case
            ``status`` carries the message to display.
            """
            try:
                if not api_key:
                    return "Please enter your OpenAI API key first.", None

                client = OpenAI(api_key=api_key)
                completion = client.chat.completions.create(
                    model="gpt-4o-audio-preview",
                    modalities=["text", "audio"],
                    audio={"voice": voice_type, "format": "wav"},
                    messages=[
                        {
                            "role": "user",
                            "content": f"This is a sample of the {voice_type} voice. It has its own unique tone and character.",
                        }
                    ],
                )

                # NamedTemporaryFile creates the file atomically;
                # tempfile.mktemp only returns a name and is open to a race
                # before the file is opened.
                wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    f.write(wav_bytes)
                    temp_path = f.name

                return f"Sample generated with voice: {voice_type}", temp_path
            except Exception as e:
                # UI boundary: show the failure in the status box.
                return f"Error: {str(e)}", None

        with gr.Row():
            sample_voice = gr.Dropdown(
                choices=VOICES,
                value="alloy",
                label="Select Voice Sample",
            )
            sample_btn = gr.Button("Generate Sample")
        with gr.Row():
            sample_text = gr.Textbox(label="Status")
            sample_audio = gr.Audio(label="Voice Sample")

        sample_btn.click(
            fn=generate_voice_sample,
            inputs=[api_key, sample_voice],
            outputs=[sample_text, sample_audio],
        )

    # The Markdown text is kept flush-left so the string carries no leading
    # indentation into the rendered notes.
    gr.Markdown("""
## Notes:
- You must provide your OpenAI API key in the field above
- The model used is `gpt-4o-audio-preview`
- Audio inputs should be in WAV format
- Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse
""")

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    app.launch()