import os
import requests
import streamlit as st
from llama_cpp import Llama

# ✅ Streamlit page config (must be the first Streamlit call)
st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

# ✅ Define model path and download URL
MODEL_PATH = "./Phi-3-mini-4k-instruct-q4.gguf"
MODEL_URL = "https://huggingface.co./microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"
# ✅ Check if the model exists locally, otherwise download it
if not os.path.exists(MODEL_PATH):
    st.info("Downloading the model file. Please wait...")
    try:
        with requests.get(MODEL_URL, stream=True) as response:
            response.raise_for_status()  # Raise on HTTP errors so we don't save an error page as the model
            with open(MODEL_PATH, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        st.success("Model downloaded successfully!")
    except requests.exceptions.RequestException:
        # Remove any partially written file so the next run retries the download
        if os.path.exists(MODEL_PATH):
            os.remove(MODEL_PATH)
        st.error("🚨 Model download failed. Please try again later.")
        st.stop()
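# Hedged alternative (a sketch, not wired into the app above): the
# `huggingface_hub` client handles cached, resumable downloads from the Hub.
# This assumes the `huggingface_hub` package is installed; repo_id/filename
# mirror the URL used above.
#
#   from huggingface_hub import hf_hub_download
#   MODEL_PATH = hf_hub_download(
#       repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
#       filename="Phi-3-mini-4k-instruct-q4.gguf",
#   )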
# ✅ Load the optimized model with a reduced context length
try:
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,    # ✅ Lower memory usage, speeds up responses
            n_threads=2,  # Matches available vCPUs
            numa=True,
            n_batch=64    # ✅ Faster token processing
        )
except Exception:
    st.error("🚨 Error loading model. Please restart the application.")
    st.stop()
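# Hedged sketch (an assumption, not part of the app above): Streamlit's
# @st.cache_resource decorator is another common way to keep a single Llama
# instance alive across reruns, instead of storing it in st.session_state:
#
#   @st.cache_resource
#   def load_model(path: str) -> Llama:
#       # Same constructor arguments as above; cached once per process.
#       return Llama(model_path=path, n_ctx=256, n_threads=2, n_batch=64)
#
#   # model = load_model(MODEL_PATH)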
# User-friendly chat interface
st.title("🤖 Phi-3 Mini Chatbot")
st.markdown("### Ask me anything and I'll provide helpful responses!")

# Chat history
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Display chat history
for role, text in st.session_state["messages"]:
    with st.chat_message(role):
        st.write(text)
# Input field for the user's message
user_input = st.text_input("Your Message:", "", key="user_input")
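# Note (sketch only, not wired in): on newer Streamlit versions, st.chat_input()
# is the idiomatic chat widget and clears itself after each submission:
#
#   user_input = st.chat_input("Your Message:")
#   if user_input:
#       ...  # same handling as the Send-button branch below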
if st.button("Send") and user_input:
    # Add the user input to the chat history
    st.session_state["messages"].append(("user", user_input))
    with st.chat_message("user"):
        st.write(user_input)

    # ✅ Use a minimal prompt format (no system message)
    formatted_messages = [{"role": "user", "content": user_input}]
    # ✅ Generate a response (non-streaming); max_tokens caps the reply length
    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages,
        max_tokens=110, temperature=0.5, top_p=0.8,
        stop=["\n", "<|endoftext|>"],  # Stops at the first newline or end-of-text token
        stream=False
    )
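    # Hedged streaming sketch (an assumption, not enabled above): llama-cpp-python
    # can yield OpenAI-style chunks when stream=True, and recent Streamlit
    # versions can render them incrementally with st.write_stream:
    #
    #   def token_stream():
    #       for chunk in st.session_state["model"].create_chat_completion(
    #           messages=formatted_messages,
    #           max_tokens=110, temperature=0.5, top_p=0.8,
    #           stop=["\n", "<|endoftext|>"],
    #           stream=True,
    #       ):
    #           delta = chunk["choices"][0]["delta"]
    #           if "content" in delta:
    #               yield delta["content"]
    #
    #   # with st.chat_message("assistant"):
    #   #     response_text = st.write_stream(token_stream())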
    # ✅ Extract and display the response
    if "choices" in response_data and len(response_data["choices"]) > 0:
        choice = response_data["choices"][0]
        if "message" in choice and "content" in choice["message"]:
            response_text = choice["message"]["content"].strip()
            st.session_state["messages"].append(("assistant", response_text))
            with st.chat_message("assistant"):
                st.write(response_text)
        else:
            st.error("⚠️ Unable to generate a response. Please try again.")
    else:
        st.error("⚠️ No response received. Please ask again.")