import gradio as gr
import edge_tts
import tempfile
import os
from huggingface_hub import InferenceClient
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
from starlette.requests import ClientDisconnect
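# Endpoint and key for the self-hosted "Llama 3 8B Service" are read from the
# environment so no credentials live in the source. The service exposes an
# OpenAI-compatible chat API (see llm_clients below); note the variable names
# say "3B" even though the deployed model is Llama 3 8B.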
LLAMA_3B_API_ENDPOINT = os.environ.get("LLAMA_3B_API_ENDPOINT")
LLAMA_3B_API_KEY = os.environ.get("LLAMA_3B_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN", None)
default_lang = "en"
engines = { default_lang: Model(default_lang) }
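# streaming_stt_nemo keeps one loaded STT model per language. Only English is
# instantiated here, and transcribe() below always picks the "en" engine.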
LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "Chinese": "cmn",
    "French": "fra",
    "German": "deu",
    "Italian": "ita",
}
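# Languages offered for speech translation, mapped above to the three-letter
# codes (ISO 639-3) that the SeamlessExpressive CLI expects for --tgt_lang.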
def transcribe(audio):
    if audio is None:
        return ""
    lang = "en"
    model = engines[lang]
    text = model.stt_file(audio)[0]
    return text
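# Pick a client from the human-readable dropdown name. The hosted service gets
# an OpenAI-compatible client; the other choices go through the Hugging Face
# Inference API, with Phi-3-mini-4k-instruct as the fallback.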
def llm_clients(model):
    if "Llama 3 8B Service" in model:
        return OpenAI(
            base_url=LLAMA_3B_API_ENDPOINT,
            api_key=LLAMA_3B_API_KEY
        )
    elif "Llama" in model:
        return InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
    elif "Mistral" in model:
        return InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
    elif "Phi" in model:
        return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
    elif "Mixtral" in model:
        return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
    else:
        return InferenceClient("microsoft/Phi-3-mini-4k-instruct")
def randomize_seed_fn(seed: int) -> int:
    # The incoming seed is ignored: a fresh random seed is drawn on every call.
    seed = random.randint(0, 999999)
    return seed
system_prompt = """
[SYSTEM] You are OPTIMUS Prime, a personal AI voice assistant created by Jaward. Keep conversations friendly, concise, and to the point. Provide clear and direct answers, avoiding unnecessary introductions. Maintain a normal, conversational tone while being both helpful and approachable. Use context from previous interactions to enhance your responses.
Your creator, Jaward, is an AI Research Engineer at LinkSoul AI, specializing in advanced AI systems, particularly in training and optimization. He aims to develop AI that not only mimics human intelligence but also enhances it. Jaward has significantly contributed to the open-source community with fundamental implementations of AI/ML research papers. He completed his first internship at the Beijing Academy of Artificial Intelligence, where he contributed to cutting-edge research. His work led to the publication of an insightful paper, "AUTOAGENTS - A Framework for Automatic Agent Generation," accepted at IJCAI this year. Currently, Jaward is interning at LinkSoul AI, a small open-source AI research startup in Beijing.
[USER]
"""
conversation_history = []
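# Module-level chat history. Note this state is global to the process: every
# visitor shares one conversation until "Clear Conversation History" is pressed.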
def models(text, model="Llama 3 8B Service", seed=42):
    global conversation_history
    seed = int(randomize_seed_fn(seed))
    generator = torch.Generator().manual_seed(seed)  # seeded generator (not used by either client path below)
    client = llm_clients(model)
    if "Llama 3 8B Service" in model:
        messages = [
            {"role": "system", "content": system_prompt},
        ] + conversation_history + [
            {"role": "user", "content": text}
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages
        )
        assistant_response = completion.choices[0].message.content
        # Update conversation history
        conversation_history.append({"role": "user", "content": text})
        conversation_history.append({"role": "assistant", "content": assistant_response})
        # Keep only the last 10 exchanges (20 messages) to avoid token limit issues
        if len(conversation_history) > 20:
            conversation_history = conversation_history[-20:]
        return assistant_response
    else:
        # For other models, concatenate the conversation history into a single prompt string
        history_text = "\n".join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in conversation_history])
        formatted_prompt = f"{system_prompt}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"
        generate_kwargs = dict(
            max_new_tokens=300,
            seed=seed
        )
        stream = client.text_generation(
            formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
        output = ""
        for response in stream:
            if response.token.text != "</s>":
                output += response.token.text
        # Update conversation history
        conversation_history.append({"role": "user", "content": text})
        conversation_history.append({"role": "assistant", "content": output})
        # Keep only the last 10 exchanges (20 messages) to avoid token limit issues
        if len(conversation_history) > 20:
            conversation_history = conversation_history[-20:]
        return output
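# Speech-to-speech translation via the SeamlessExpressive command-line tool.
# The gated checkpoints (seamless_expressivity and vocoder_pretssel) are
# expected to already be downloaded under ./models for --gated-model-dir.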
def translate_speech(audio_file, target_language):
    if audio_file is None:
        return None
    language_code = LANGUAGE_CODES[target_language]
    output_file = "translated_audio.wav"
    command = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", language_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "models",
        "--output_path", output_file
    ]
    subprocess.run(command, check=True)
    if os.path.exists(output_file):
        print(f"File created successfully: {output_file}")
        return output_file
    else:
        print(f"File not found: {output_file}")
        return None
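# Main event handler, triggered whenever the microphone recording changes.
# It returns (assistant_audio, translated_audio, status) so a single callback
# can drive both audio players and the status box; recordings that start with
# "please translate" are routed to translate_speech instead of the LLM.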
async def respond(audio, model, seed, target_language):
    try:
        if audio is None:
            return None, None, "No input detected."
        user_input = transcribe(audio)
        if not user_input:
            return None, None, "Could not transcribe audio."
        if user_input.lower().startswith("please translate"):
            # The full recording (trigger phrase included) is passed to the
            # speech-to-speech translator, which works on the audio file
            # itself rather than on the transcript.
            translated_audio = translate_speech(audio, target_language)
            return None, translated_audio, f"Translated to {target_language}"
        else:
            reply = models(user_input, model, seed)
            communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path, None, "Voice assistant response"
    except ClientDisconnect:
        print("Client disconnected")
        return None, None, "Client disconnected. Please try again."
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None, None, f"An error occurred: {str(e)}"
def clear_history():
    global conversation_history
    conversation_history = []
    return None, None, "Conversation history cleared."
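# --- UI layout ---
# Left column: microphone input, model picker, hidden seed slider, translation
# target, and a history-reset button. Right column: the two autoplaying audio
# outputs plus a status line.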
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("# <br><center><b>Optimus Prime: Your Personal AI Voice Assistant with Speech Translation</b></center>")
    gr.Markdown("## <center><b>For speech translation, start with the phrase 'Please translate' followed by the speech you want to translate</b></center><br>")
    with gr.Row():
        with gr.Column(scale=1):
            input_audio = gr.Audio(label="Click record and start speaking", sources=["microphone"], type="filepath")
            select = gr.Dropdown(
                [
                    'Llama 3 8B Service',
                    'Mixtral 8x7B',
                    'Llama 3 8B',
                    'Mistral 7B v0.2',  # matches the v0.2 checkpoint loaded in llm_clients
                    'Phi 3 mini',
                ],
                value="Llama 3 8B Service",
                label="Model"
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=999999,
                step=1,
                value=0,
                visible=False
            )
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="German",
                label="Target Language for Translation"
            )
            clear_button = gr.Button("Clear Conversation History")
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="AI Voice Assistant's Response", type="filepath", interactive=False, autoplay=True)
            translated_audio = gr.Audio(label="Translated Speech", type="filepath", interactive=False, autoplay=True)
            status_message = gr.Textbox(label="Status", interactive=False)
    input_audio.change(
        fn=respond,
        inputs=[input_audio, select, seed, target_lang],
        outputs=[output_audio, translated_audio, status_message],
    )
    clear_button.click(fn=clear_history, inputs=[], outputs=[output_audio, translated_audio, status_message])
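# A bounded queue keeps long TTS/translation jobs from piling up without limit:
# once 200 events are waiting, Gradio turns further requests away.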
if __name__ == "__main__":
    demo.queue(max_size=200).launch()