import os

import requests
import streamlit as st
from llama_cpp import Llama

# Streamlit page config (must be the first Streamlit call)
st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

# Local model path and the download URL on Hugging Face
MODEL_PATH = "./Phi-3-mini-4k-instruct-q4.gguf"
MODEL_URL = "https://huggingface.co./microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"

# Download the model file if it is not already present
if not os.path.exists(MODEL_PATH):
    st.info("Downloading the model file. Please wait...")
    try:
        with requests.get(MODEL_URL, stream=True) as response:
            response.raise_for_status()  # Raise on HTTP errors (4xx/5xx)
            with open(MODEL_PATH, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        st.success("Model downloaded successfully!")
    except requests.exceptions.RequestException:
        # Covers HTTP errors as well as connection and timeout failures
        if os.path.exists(MODEL_PATH):
            os.remove(MODEL_PATH)  # Discard any partial download
        st.error("🚨 Model download failed. Please try again later.")
        st.stop()

# Load the model once and cache it in session state
try:
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,    # Small context window: lower memory use, faster responses
            n_threads=2,  # Matches the available vCPUs
            numa=True,
            n_batch=64,   # Larger batch size for faster prompt processing
        )
except Exception:
    st.error("🚨 Error loading model. Please restart the application.")
    st.stop()

# User-friendly chat interface
st.title("🤖 Phi-3 Mini Chatbot")
st.markdown("### Ask me anything and I'll provide helpful responses!")

# Chat history
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Display chat history
for role, text in st.session_state["messages"]:
    with st.chat_message(role):
        st.write(text)

# Input field for the user message
user_input = st.text_input("Your Message:", "", key="user_input")

if st.button("Send") and user_input:
    # Add the user message to the chat history and display it
    st.session_state["messages"].append(("user", user_input))
    with st.chat_message("user"):
        st.write(user_input)

    # Minimal prompt format (no system message)
    formatted_messages = [{"role": "user", "content": user_input}]

    # Generate a short reply; stopping on "\n" and the end-of-text token
    # keeps responses to a single line
    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages,
        max_tokens=110,
        temperature=0.5,
        top_p=0.8,
        stop=["\n", "<|endoftext|>"],
        stream=False,
    )

    # Extract and display the response
    if "choices" in response_data and len(response_data["choices"]) > 0:
        choice = response_data["choices"][0]
        if "message" in choice and "content" in choice["message"]:
            response_text = choice["message"]["content"].strip()
            st.session_state["messages"].append(("assistant", response_text))
            with st.chat_message("assistant"):
                st.write(response_text)
        else:
            st.error("⚠️ Unable to generate a response. Please try again.")
    else:
        st.error("⚠️ No response received. Please ask again.")
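
# Usage note (a minimal sketch; the file name app.py is an assumption, use
# whatever name this script is saved under):
#
#   pip install streamlit requests llama-cpp-python
#   streamlit run app.py
#
# On first launch the app downloads the ~2 GB GGUF file next to the script,
# so the working directory must be writable.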