# Earlier approach, kept for reference: serve the model directly with gr.load.
# import gradio as gr
# import os
# gr.load("models/google/gemma-1.1-7b-it", hf_token=os.environ.get("YOUR_API_TOKEN"), streaming=True).launch()
import gradio as gr
import os

# Install the openai client library at runtime, before importing it.
os.system('pip install openai')
from openai import OpenAI

# OpenAI-compatible client pointed at the Hugging Face Inference API;
# the token is read from the YOUR_API_TOKEN environment variable.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get('YOUR_API_TOKEN')
)
def predict(message, history, test=""):  # test is accepted but unused
    print("1 ", message)   # debug: latest user message
    print("2 ", history)   # debug: history as (user, assistant) pairs

    # Convert Gradio's tuple-based history into OpenAI-style chat messages.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})
    # Request a streamed chat completion from the selected model.
    response = client.chat.completions.create(
        model='meta-llama/Meta-Llama-3-8B-Instruct',
        # model='nvidia/Llama3-ChatQA-1.5-8B',
        messages=history_openai_format,
        temperature=0.7,
        stream=True,
        max_tokens=3000
    )
    # Accumulate streamed tokens, yielding the growing reply so the
    # chat UI updates live as each chunk arrives.
    partial_message = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            partial_message = partial_message + chunk.choices[0].delta.content
            yield partial_message
gr.ChatInterface(predict).launch()