import os
import requests
import streamlit as st
from llama_cpp import Llama

# ✅ Streamlit Page Config (Must be first)
st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

# ✅ Define model path
MODEL_PATH = "./Phi-3-mini-4k-instruct-q4.gguf"
MODEL_URL = "https://huggingface.co./microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"

# ✅ Check if model exists, otherwise download
if not os.path.exists(MODEL_PATH):
    st.info("Downloading the model file. Please wait...")
    try:
        with requests.get(MODEL_URL, stream=True) as response:
            response.raise_for_status()  # Raises on HTTP error status codes (handled below)
            with open(MODEL_PATH, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        st.success("Model downloaded successfully!")
    except requests.exceptions.RequestException as e:
        st.error(f"🚨 Model download failed: {e}. Please try again later.")
        st.stop()
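
# Optional hardening (sketch, not wired in above): downloading to a temporary file
# and moving it into place only after the transfer completes avoids mistaking a
# partially downloaded file for a complete model on the next run. The helper name
# download_model is hypothetical; shown for illustration only.
#
# def download_model(url: str, dest: str) -> None:
#     import tempfile, shutil
#     with requests.get(url, stream=True, timeout=60) as response:
#         response.raise_for_status()
#         with tempfile.NamedTemporaryFile(delete=False, dir=".") as tmp:
#             for chunk in response.iter_content(chunk_size=8192):
#                 tmp.write(chunk)
#     shutil.move(tmp.name, dest)  # rename only once the full file is on disk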

# ✅ Load optimized model with reduced context length
try:
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,  # ✅ Lower memory usage, speeds up responses
            n_threads=2,  # Matches available vCPUs
            numa=True,  # Enable NUMA optimizations where supported
            n_batch=64  # ✅ Faster token processing
        )
except Exception as e:
    st.error("🚨 Error loading model. Please restart the application.")
    st.stop()
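
# Alternative caching approach (sketch, untested here): Streamlit's st.cache_resource
# can hold the Llama instance across reruns and sessions instead of st.session_state.
# The load_model helper below is hypothetical and is not used by the app above.
#
# @st.cache_resource
# def load_model(path: str) -> Llama:
#     return Llama(model_path=path, n_ctx=256, n_threads=2, numa=True, n_batch=64)
#
# model = load_model(MODEL_PATH)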

# 🌟 User-Friendly Chat Interface
st.title("🤖 Phi-3 Mini Chatbot")
st.markdown("### Ask me anything and I'll provide helpful responses!")

# Chat history
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Display chat history
for role, text in st.session_state["messages"]:
    with st.chat_message(role):
        st.write(text)

# Input field for user message
user_input = st.text_input("Your Message:", "", key="user_input")
if st.button("Send") and user_input:
    # Add user input to chat history
    st.session_state["messages"].append(("user", user_input))
    with st.chat_message("user"):
        st.write(user_input)

    # ✅ Use a minimal prompt format (no system message)
    formatted_messages = [{"role": "user", "content": user_input}]

    # ✅ Keep generation fast and bounded: cap max_tokens and set stop sequences so replies end cleanly
    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages, 
        max_tokens=110, temperature=0.5, top_p=0.8,
        stop=["\n", "<|endoftext|>"],  # ✅ Ensures responses end properly
        stream=False
    )
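
    # Optional streaming variant (sketch, untested here): with stream=True,
    # llama-cpp-python yields OpenAI-style delta chunks, which st.write_stream
    # (available in recent Streamlit versions) can render incrementally.
    # The stream_reply helper is hypothetical and not wired into the app above.
    #
    # def stream_reply(chunks):
    #     for chunk in chunks:
    #         delta = chunk["choices"][0].get("delta", {})
    #         if "content" in delta:
    #             yield delta["content"]
    #
    # with st.chat_message("assistant"):
    #     streamed_text = st.write_stream(stream_reply(
    #         st.session_state["model"].create_chat_completion(
    #             messages=formatted_messages, max_tokens=110, stream=True)))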

    # ✅ Extract and display response
    if "choices" in response_data and len(response_data["choices"]) > 0:
        choice = response_data["choices"][0]
        if "message" in choice and "content" in choice["message"]:
            response_text = choice["message"]["content"].strip()
            st.session_state["messages"].append(("assistant", response_text))
            with st.chat_message("assistant"):
                st.write(response_text)
        else:
            st.error("⚠️ Unable to generate a response. Please try again.")
    else:
        st.error("⚠️ No response received. Please ask again.")