File size: 1,273 Bytes
68581e5
 
 
 
 
3531ab0
c1beabf
ad13324
 
ac24d92
68581e5
 
 
 
 
c092d4c
0ab5719
 
68581e5
 
 
 
 
 
4343090
52fce42
68581e5
 
c092d4c
 
68581e5
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# import gradio as gr
# import os

# gr.load("models/google/gemma-1.1-7b-it", hf_token=os.environ.get("YOUR_API_TOKEN"), streaming=True).launch()

import gradio as gr
import os
# HACK: installing a dependency at runtime is fragile — this belongs in a
# requirements.txt. Guarded so the shell-out only runs when the package is
# actually missing, and routed through the current interpreter's pip rather
# than whatever `pip` happens to be on PATH.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("openai") is None:
    subprocess.run([sys.executable, "-m", "pip", "install", "openai"], check=False)

from openai import OpenAI

# Hugging Face's OpenAI-compatible inference endpoint; the API token is read
# from the environment so it never lands in source control.
# NOTE(review): env var is literally named 'YOUR_API_TOKEN' — presumably a
# placeholder name kept from a template; verify against the deployment config.
client = OpenAI(
  base_url="https://api-inference.huggingface.co/v1",
  api_key=os.environ.get('YOUR_API_TOKEN')
)

def predict(message, history, test=""):
    """Stream a chat completion for *message* given the conversation *history*.

    Args:
        message: The latest user message.
        history: Gradio tuple-style history — a list of
            ``(user_text, assistant_text)`` pairs.
        test: Unused; kept only so the existing call signature is unchanged.

    Yields:
        str: The assistant reply accumulated so far; Gradio re-renders the
        full string on every yield, producing a streaming effect.
    """
    # Flatten Gradio's (user, assistant) pairs into OpenAI-format messages,
    # then append the new user message.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model='meta-llama/Meta-Llama-3-8B-Instruct',
        messages=history_openai_format,
        temperature=0.7,
        stream=True,
        max_tokens=3000,
    )

    # Accumulate streamed deltas; chunks with a None delta (e.g. the final
    # stop chunk) carry no text and are skipped.
    partial_message = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial_message += delta
            yield partial_message

gr.ChatInterface(predict).launch()