import os

import requests
import streamlit as st
from llama_cpp import Llama

# Streamlit page config (must be the first Streamlit call)
st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

# Local model path and the download URL on Hugging Face
MODEL_PATH = "./Phi-3-mini-4k-instruct-q4.gguf"
MODEL_URL = "https://huggingface.co./microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"

# Download the model file if it is not already present
if not os.path.exists(MODEL_PATH):
    st.info("Downloading the model file. Please wait...")
    try:
        with requests.get(MODEL_URL, stream=True) as response:
            response.raise_for_status()  # Raise on HTTP errors (4xx/5xx)
            with open(MODEL_PATH, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        st.success("Model downloaded successfully!")
    except requests.exceptions.RequestException:
        # Covers HTTP errors as well as connection and timeout failures
        if os.path.exists(MODEL_PATH):
            os.remove(MODEL_PATH)  # Discard any partial download
        st.error("🚨 Model download failed. Please try again later.")
        st.stop()

# Load the model once and cache it in session state
try:
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,    # Small context window: lower memory use, faster responses
            n_threads=2,  # Matches the available vCPUs
            numa=True,
            n_batch=64,   # Larger batch size for faster prompt processing
        )
except Exception:
    st.error("🚨 Error loading model. Please restart the application.")
    st.stop()

# User-friendly chat interface
st.title("🤖 Phi-3 Mini Chatbot")
st.markdown("### Ask me anything and I'll provide helpful responses!")

# Chat history
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Display chat history
for role, text in st.session_state["messages"]:
    with st.chat_message(role):
        st.write(text)

# Input field for the user message
user_input = st.text_input("Your Message:", "", key="user_input")

if st.button("Send") and user_input:
    # Add the user message to the chat history and display it
    st.session_state["messages"].append(("user", user_input))
    with st.chat_message("user"):
        st.write(user_input)

    # Minimal prompt format (no system message)
    formatted_messages = [{"role": "user", "content": user_input}]

    # Generate a short reply; stopping on "\n" and the end-of-text token
    # keeps responses to a single line
    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages,
        max_tokens=110,
        temperature=0.5,
        top_p=0.8,
        stop=["\n", "<|endoftext|>"],
        stream=False,
    )

    # Extract and display the response
    if "choices" in response_data and len(response_data["choices"]) > 0:
        choice = response_data["choices"][0]
        if "message" in choice and "content" in choice["message"]:
            response_text = choice["message"]["content"].strip()
            st.session_state["messages"].append(("assistant", response_text))
            with st.chat_message("assistant"):
                st.write(response_text)
        else:
            st.error("⚠️ Unable to generate a response. Please try again.")
    else:
        st.error("⚠️ No response received. Please ask again.")
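
# Usage note (a minimal sketch; the file name app.py is an assumption, use
# whatever name this script is saved under):
#
#   pip install streamlit requests llama-cpp-python
#   streamlit run app.py
#
# On first launch the app downloads the ~2 GB GGUF file next to the script,
# so the working directory must be writable.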