File size: 9,097 Bytes
c50ad78
 
 
 
 
 
 
 
 
 
 
81e33eb
ddb6345
c50ad78
add39ec
 
1930b34
add39ec
c50ad78
066339d
c50ad78
ec5ecf9
 
 
44915e6
ec5ecf9
 
44915e6
ec5ecf9
 
c50ad78
4a201a6
 
c50ad78
 
 
 
 
1930b34
3133efe
7747dd1
add39ec
 
7747dd1
c50ad78
 
 
 
 
 
7747dd1
 
c50ad78
 
 
 
 
 
 
1930b34
 
 
 
c50ad78
 
 
a0024f7
 
da1e58a
a0024f7
c50ad78
 
9726928
1930b34
9726928
3133efe
c50ad78
1930b34
a0024f7
c50ad78
 
 
 
 
 
a0024f7
9726928
a0024f7
 
 
9726928
a0024f7
 
 
9726928
a0024f7
c50ad78
a0024f7
 
1930b34
9726928
c50ad78
 
 
 
 
 
 
 
 
 
9726928
a0024f7
 
 
9726928
a0024f7
 
 
9726928
c50ad78
 
e0fb8ee
 
 
9726928
d99b477
e0fb8ee
9726928
d99b477
 
e0fb8ee
d99b477
 
 
 
 
 
9726928
e0fb8ee
81e33eb
e0fb8ee
 
 
 
 
4a201a6
781ee39
ec5ecf9
ddb6345
 
d99b477
 
ddb6345
 
d99b477
 
ddb6345
d99b477
 
 
 
ddb6345
 
 
 
 
 
d99b477
3169305
 
d99b477
ddb6345
3169305
d99b477
ec5ecf9
d4318d7
 
 
d99b477
c50ad78
d4318d7
04f7032
 
e0a46f2
 
 
ac236d5
e0a46f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d99b477
 
 
 
 
e0a46f2
 
 
ac236d5
 
d99b477
ec5ecf9
ddb6345
3169305
ec5ecf9
d99b477
ec5ecf9
 
d99b477
d4318d7
dff1c5d
d99b477
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import re
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
from starlette.requests import ClientDisconnect

# OpenAI-compatible endpoint + key for the hosted "Llama 3 8B Service".
# Both may be None if the env vars are unset; client creation then fails at call time.
LLAMA_3B_API_ENDPOINT = os.environ.get("LLAMA_3B_API_ENDPOINT")
LLAMA_3B_API_KEY = os.environ.get("LLAMA_3B_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN", None)  # NOTE(review): read but never used in this file — confirm

default_lang = "en"
# Speech-to-text engines keyed by language code; only English is loaded at import time.
engines = { default_lang: Model(default_lang) }

# Display name -> language code passed to the expressivity_predict CLI (--tgt_lang).
LANGUAGE_CODES = {
    "English": "eng",
    "Spanish": "spa",
    "Chinese": "cmn",
    "French": "fra",
    "German": "deu",
    "Italian": "ita"
}

def transcribe(audio):
    """Run speech-to-text on an audio file path.

    Returns the transcription string, or "" when no audio was supplied.
    Always uses the English engine — the only one loaded at module import.
    """
    if audio is None:
        return ""
    engine = engines["en"]
    transcription, *_ = engine.stt_file(audio)
    return transcription

def llm_clients(model):
    """Return an inference client for the selected model display name.

    The hosted "Llama 3 8B Service" gets an OpenAI-compatible client; every
    other choice is matched by substring (in the original priority order)
    against Hugging Face Hub repos, falling back to Phi-3 mini.
    """
    if "Llama 3 8B Service" in model:
        return OpenAI(
            base_url=LLAMA_3B_API_ENDPOINT,
            api_key=LLAMA_3B_API_KEY
        )

    # Substring -> Hub repo id; order matters and mirrors the original chain.
    hub_repos = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
    )
    for needle, repo_id in hub_repos:
        if needle in model:
            return InferenceClient(repo_id)
    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")

def randomize_seed_fn(seed: int) -> int:
    """Return a fresh random seed in [0, 999999].

    The incoming ``seed`` is deliberately ignored: every call re-randomizes.
    The parameter is kept only so existing call sites (``models`` passes the
    UI slider value positionally) keep working.
    """
    return random.randint(0, 999999)

# Persona/system prompt sent with every request.  The "[SYSTEM]"/"[USER]"
# tags are part of the raw prompt format expected by the text-generation
# models; do not edit the string's wording without retesting both paths.
system_prompt = """
[SYSTEM] You are OPTIMUS Prime, a personal AI voice assistant created by Jaward. Keep conversations friendly, concise, and to the point. Provide clear and direct answers, avoiding unnecessary introductions. Maintain a normal, conversational tone while being both helpful and approachable. Use context from previous interactions to enhance your responses.

Your creator, Jaward, is an AI Research Engineer at Linksoul AI, specializing in advanced AI systems, particularly in training and optimization. He aims to develop AI that not only mimics human intelligence but also enhances it. Jaward has significantly contributed to the open-source community with fundamental implementations of AI/ML research papers. He completed his first internship at the Beijing Academy of Artificial Intelligence, where he contributed to cutting-edge research. His work led to the publication of an insightful paper, "AUTOAGENTS - A Framework for Automatic Agent Generation," accepted at IJCAI this year. Currently, Jaward is interning at LinkSoul AI, a small open-source AI research startup in Beijing.
[USER]
"""

# Module-level chat memory (list of {"role", "content"} dicts).
# NOTE(review): shared across ALL Gradio sessions — concurrent users see one
# combined history; confirm whether per-session state is intended.
conversation_history = []

def models(text, model="Llama 3 8B Service", seed=42):
    """Generate an assistant reply to ``text`` with the selected model.

    Parameters:
        text: the user's transcribed utterance.
        model: UI model display name (routed via ``llm_clients``).
        seed: incoming seed value; re-randomized by ``randomize_seed_fn``.

    Returns the assistant's reply string.  Appends the exchange to the
    module-level ``conversation_history`` (trimmed to the last 20 messages,
    i.e. 10 user/assistant exchanges).
    """
    seed = int(randomize_seed_fn(seed))

    client = llm_clients(model)

    if "Llama 3 8B Service" in model:
        # Chat-completions path: history is passed as structured messages.
        messages = [
            {"role": "system", "content": system_prompt},
        ] + conversation_history + [
            {"role": "user", "content": text}
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages
        )
        assistant_response = completion.choices[0].message.content
        _record_exchange(text, assistant_response)
        return assistant_response
    else:
        # Raw text-generation path: history is flattened into the prompt.
        history_text = "\n".join(
            f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
            for msg in conversation_history
        )
        formatted_prompt = f"{system_prompt}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"

        generate_kwargs = dict(
            max_new_tokens=300,
            seed=seed
        )
        stream = client.text_generation(
            formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
        output = ""
        for response in stream:
            # Drop the end-of-sequence token if the backend emits it literally.
            if response.token.text != "</s>":
                output += response.token.text

        _record_exchange(text, output)
        return output

def _record_exchange(user_text, assistant_text):
    """Append one user/assistant exchange to the shared history and trim it
    to the last 20 messages (10 exchanges) to stay under the token limit."""
    global conversation_history
    conversation_history.append({"role": "user", "content": user_text})
    conversation_history.append({"role": "assistant", "content": assistant_text})
    if len(conversation_history) > 20:
        conversation_history = conversation_history[-20:]

def translate_speech(audio_file, target_language):
    """Translate spoken audio to ``target_language`` speech.

    Shells out to the ``expressivity_predict`` CLI (SeamlessExpressive) and
    returns the path of the generated WAV, or None when there is no input or
    no output file was produced.  Raises CalledProcessError if the CLI exits
    non-zero (check=True).
    """
    if audio_file is None:
        return None

    tgt_code = LANGUAGE_CODES[target_language]
    out_path = "translated_audio.wav"

    cmd = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", tgt_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "models",
        "--output_path", out_path,
    ]
    subprocess.run(cmd, check=True)

    if not os.path.exists(out_path):
        print(f"File not found: {out_path}")
        return None
    print(f"File created successfully: {out_path}")
    return out_path

async def respond(audio, model, seed, target_language):
    """Main Gradio callback: transcribe the mic input, then either translate
    the speech or generate and voice an assistant reply.

    Parameters:
        audio: recorded audio file path (or None).
        model: model display name from the dropdown.
        seed: seed slider value (re-randomized downstream).
        target_language: translation target from the dropdown.

    Returns a 3-tuple (assistant_audio_path, translated_audio_path, status)
    matching the three output components; unused slots are None.
    """
    try:
        if audio is None:
            return None, None, "No input detected."

        user_input = transcribe(audio)
        if not user_input:
            return None, None, "Could not transcribe audio."

        if user_input.lower().startswith("please translate"):
            # NOTE(review): the full recording — including the spoken
            # "please translate" trigger phrase — is fed to the speech-to-
            # speech translator; there is no way to trim the audio prefix
            # here.  Confirm this is acceptable output.
            translated_audio = translate_speech(audio, target_language)
            return None, translated_audio, f"Translated to {target_language}"
        else:
            reply = models(user_input, model, seed)
            # Synthesize the reply to a temp WAV; delete=False so Gradio can
            # read the file after the handle is closed.
            communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
                await communicate.save(tmp_path)
            return tmp_path, None, "Voice assistant response"
    except ClientDisconnect:
        print("Client disconnected")
        return None, None, "Client disconnected. Please try again."
    except Exception as e:
        # Top-level UI boundary: surface the error as a status message
        # instead of crashing the callback.
        print(f"An error occurred: {str(e)}")
        return None, None, f"An error occurred: {str(e)}"

def clear_history():
    """Reset the shared conversation history and blank the three outputs.

    Returns (None, None, status) matching the UI's output components.
    """
    global conversation_history
    conversation_history = []
    return (None, None, "Conversation history cleared.")

# --- Gradio UI layout and event wiring -------------------------------------
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("# <br><center><b>Optimus Prime: Your Personal AI Voice Assistant with Speech Translation</b></center>")
    gr.Markdown("## <center><b>For speech translation, start with the phrase 'Please translate' followed by the speech you want to translate</b></center><br>")

    with gr.Row():
        # Left column: all inputs.
        with gr.Column(scale=1):
            input_audio = gr.Audio(label="Click record and start speaking", sources=["microphone"], type="filepath")
            select = gr.Dropdown([
                'Llama 3 8B Service',
                'Mixtral 8x7B',
                'Llama 3 8B',
                'Mistral 7B v0.3',
                'Phi 3 mini',
            ],
            value="Llama 3 8B Service",
            label="Model"
            )
            # Hidden seed slider — still passed to respond(); the value is
            # re-randomized inside models() anyway.
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=999999,
                step=1,
                value=0,
                visible=False
            )
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="German",
                label="Target Language for Translation"
            )
            clear_button = gr.Button("Clear Conversation History")

        # Right column: assistant audio, translated audio, and status text.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="AI Voice Assistant's Response", type="filepath", interactive=False, autoplay=True)
            translated_audio = gr.Audio(label="Translated Speech", type="filepath", interactive=False, autoplay=True)
            status_message = gr.Textbox(label="Status", interactive=False)

    # Fire the async respond() whenever a new recording lands in input_audio.
    input_audio.change(
        fn=respond, 
        inputs=[input_audio, select, seed, target_lang],
        outputs=[output_audio, translated_audio, status_message],
    )

    clear_button.click(fn=clear_history, inputs=[], outputs=[output_audio, translated_audio, status_message])

if __name__ == "__main__":
    # Queue allows up to 200 pending requests before rejecting new ones.
    demo.queue(max_size=200).launch()