import base64
import tempfile
import requests
import gradio as gr
from openai import OpenAI

# Available voices for audio generation
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "verse"]
def process_text_input(api_key, text_prompt, selected_voice):
    """Generate audio response from text input"""
    try:
        if not api_key:
            return "Please enter your OpenAI API key first.", None

        # Initialize OpenAI client with the provided API key
        client = OpenAI(api_key=api_key)

        # Request both a text and an audio rendering of the answer
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": text_prompt
                }
            ]
        )

        # Decode the base64 audio payload and save it to a temporary file
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # Get the text response; when audio output is requested, the text may
        # arrive as the audio transcript rather than in message.content
        message = completion.choices[0].message
        text_response = message.content or message.audio.transcript
        return text_response, temp_path
    except Exception as e:
        return f"Error: {str(e)}", None
def process_audio_input(api_key, audio_path, text_prompt, selected_voice):
    """Process audio input and generate a response"""
    try:
        if not api_key:
            return "Please enter your OpenAI API key first.", None
        if not audio_path:
            return "Please upload or record audio first.", None

        # Initialize OpenAI client with the provided API key
        client = OpenAI(api_key=api_key)

        # Read audio file and encode to base64
        with open(audio_path, "rb") as audio_file:
            audio_data = audio_file.read()
        encoded_audio = base64.b64encode(audio_data).decode('utf-8')

        # Create message content with both text and audio
        message_content = []
        if text_prompt:
            message_content.append({
                "type": "text",
                "text": text_prompt
            })
        message_content.append({
            "type": "input_audio",
            "input_audio": {
                "data": encoded_audio,
                "format": "wav"
            }
        })

        # Call OpenAI API
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": message_content
                }
            ]
        )

        # Decode the base64 audio payload and save it to a temporary file
        wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # Get the text response, falling back to the audio transcript
        message = completion.choices[0].message
        text_response = message.content or message.audio.transcript
        return text_response, temp_path
    except Exception as e:
        return f"Error: {str(e)}", None
def download_example_audio():
    """Download an example audio file for testing"""
    try:
        url = "https://cdn.openai.com/API/docs/audio/alloy.wav"
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Save to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
            return f.name
    except Exception:
        return None


def use_example_audio():
    """Load example audio for the interface"""
    return download_example_audio()
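
# If the download fails, use_example_audio returns None and the gr.Audio
# component is simply left empty; no error message is surfaced to the user.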
# Create Gradio Interface
with gr.Blocks(title="OpenAI Audio Chat App") as app:
    gr.Markdown("# OpenAI Audio Chat App")
    gr.Markdown("Interact with the GPT-4o audio model through text and audio inputs")

    # API key input (shared across all tabs)
    api_key = gr.Textbox(
        label="OpenAI API Key",
        placeholder="Enter your OpenAI API key here",
        type="password"
    )

    with gr.Tab("Text to Audio"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text Prompt",
                    placeholder="Enter your question or prompt here...",
                    lines=3
                )
                text_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Voice"
                )
                text_submit = gr.Button("Generate Response")
            with gr.Column():
                text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_output = gr.Audio(label="AI Response (Audio)")

        text_submit.click(
            fn=process_text_input,
            inputs=[api_key, text_input, text_voice],
            outputs=[text_output, audio_output]
        )

    with gr.Tab("Audio Input to Audio Response"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                example_btn = gr.Button("Use Example Audio")
                accompanying_text = gr.Textbox(
                    label="Accompanying Text (Optional)",
                    placeholder="Add any text context or question about the audio...",
                    lines=2
                )
                audio_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Response Voice"
                )
                audio_submit = gr.Button("Process Audio & Generate Response")
            with gr.Column():
                audio_text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                audio_audio_output = gr.Audio(label="AI Response (Audio)")

        audio_submit.click(
            fn=process_audio_input,
            inputs=[api_key, audio_input, accompanying_text, audio_voice],
            outputs=[audio_text_output, audio_audio_output]
        )

        example_btn.click(
            fn=use_example_audio,
            inputs=[],
            outputs=[audio_input]
        )
with gr.Tab("Voice Samples"): | |
gr.Markdown("## Listen to samples of each voice") | |
def generate_voice_sample(api_key, voice_type): | |
try: | |
if not api_key: | |
return "Please enter your OpenAI API key first.", None | |
client = OpenAI(api_key=api_key) | |
completion = client.chat.completions.create( | |
model="gpt-4o-audio-preview", | |
modalities=["text", "audio"], | |
audio={"voice": voice_type, "format": "wav"}, | |
messages=[ | |
{ | |
"role": "user", | |
"content": f"This is a sample of the {voice_type} voice. It has its own unique tone and character." | |
} | |
] | |
) | |
# Save the audio to a temporary file | |
wav_bytes = base64.b64decode(completion.choices[0].message.audio.data) | |
temp_path = tempfile.mktemp(suffix=".wav") | |
with open(temp_path, "wb") as f: | |
f.write(wav_bytes) | |
return f"Sample generated with voice: {voice_type}", temp_path | |
except Exception as e: | |
return f"Error: {str(e)}", None | |
with gr.Row(): | |
sample_voice = gr.Dropdown( | |
choices=VOICES, | |
value="alloy", | |
label="Select Voice Sample" | |
) | |
sample_btn = gr.Button("Generate Sample") | |
with gr.Row(): | |
sample_text = gr.Textbox(label="Status") | |
sample_audio = gr.Audio(label="Voice Sample") | |
sample_btn.click( | |
fn=generate_voice_sample, | |
inputs=[api_key, sample_voice], | |
outputs=[sample_text, sample_audio] | |
) | |
gr.Markdown(""" | |
## Notes: | |
- You must provide your OpenAI API key in the field above | |
- The model used is `gpt-4o-audio-preview` | |
- Audio inputs should be in WAV format | |
- Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse | |
""") | |
if __name__ == "__main__": | |
app.launch() |
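    # On Hugging Face Spaces the default launch() is sufficient; to expose a
    # temporary public URL when running locally, Gradio also supports
    # app.launch(share=True).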